import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import entropy
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): hard-coded absolute Windows path — breaks on any other machine;
# consider a relative path or a configuration value.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
# Display the loaded dataframe (notebook cell output)
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Build the 13 population-based pairs of countries (26 countries total) into a
# single dataframe. The 26 copy-pasted per-country filters are replaced by one
# loop; the list below preserves the exact concatenation order of the original.
pair_order = [
    "Austria", "Belgium", "Bulgaria", "Cyprus", "Czechia", "Denmark",
    "Estonia", "Finland", "France", "Netherlands", "Portugal", "Slovakia",
    "United Kingdom", "Switzerland", "Canada", "Serbia", "Luxembourg",
    "Romania", "Ireland", "Latvia", "Iceland", "Italy", "Sweden", "Spain",
    "Slovenia", "United States",
]
frames = []
for country in pair_order:
    country_rows = df[df.location == country]
    if country == "United Kingdom":
        # The original dropped the first two UK rows (df_UnitedKingdom.tail(-2));
        # presumably to align its start date with the other countries — TODO confirm.
        country_rows = country_rows.tail(-2)
    frames.append(country_rows)
dataframe_one = pd.concat(frames)
# Exporting the combined dataframe to a CSV file in the working directory
dataframe_one.to_csv("dataframe-one.csv")
# Importing the dataframe of all 26 countries
# NOTE(review): the CSV was written above as "dataframe-one.csv" in the working
# directory but is read back from Downloads — confirm the two paths refer to
# the same file.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded dataframe (notebook cell output)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for this analysis
country1 = 'Austria'
country2 = 'Switzerland'
# Restrict to the population-health-index features (plus identifiers and the
# 'Mortality Rate' target) and to the two paired countries, in one step.
pop_health_cols = ['location', 'date', 'cardiovasc_death_rate',
                   'diabetes_prevalence', 'female_smokers', 'male_smokers',
                   'life_expectancy', 'aged_65_older', 'median_age',
                   'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            pop_health_cols]
# Display the filtered dataframe (notebook cell output)
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.322149 |
2078 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features turn the time series into a supervised-learning
# problem: previous day (shift 1), previous week (shift 7), previous month
# (shift 30), computed per country.
mortality_lags = {'prev_day_mortality': 1,
                  'prev_week_mortality': 7,
                  'prev_month_mortality': 30}
for lag_col, lag_days in mortality_lags.items():
    # shift() leaves NaN at the head of each country's series; treat those as 0
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Fit PCA on every column from index 2 onward to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and its lagged
# copies, so the target itself feeds the principal components (target
# leakage) — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 7 principal components — matching the 7 input variables of
# the XGBoost Model Analysis for the population health index
n_components = 7
pc_scores = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): each principal component is a linear mix of ALL inputs, so
# labelling the component columns with the raw feature names is only a
# convenience — the downstream "feature importance" table actually ranks
# components, not raw features.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence',
                 'female_smokers', 'male_smokers', 'life_expectancy',
                 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(data=pc_scores, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical location column with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs are the component scores; the target is the raw mortality rate
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only (avoids test-set leakage)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both splits (scaler was fit on train only)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor; hyperparameters are chosen by the grid search below
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space
search_space = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=search_space, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9991699923252158
# Refit the tuned estimator on the full (scaled) training set
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Test-set metrics: MSE, RMSE, R^2 score, and entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to
# probability distributions and returns their KL divergence — this is not a
# standard regression error metric; confirm it is what was intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Rank the PCA-derived inputs by their XGBoost importance scores
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
# Display the importance table (notebook cell output)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.928385 |
| 6 | median_age | 0.045537 |
| 2 | female_smokers | 0.012387 |
| 0 | cardiovasc_death_rate | 0.009839 |
| 5 | aged_65_older | 0.002963 |
| 3 | male_smokers | 0.000738 |
| 4 | life_expectancy | 0.000151 |
# Importing the dataframe of all 26 countries
# NOTE(review): re-reads the same exported CSV from Downloads (the export above
# wrote to the working directory) — confirm the two paths refer to the same file.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded dataframe (notebook cell output)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for this analysis
country1 = 'Austria'
country2 = 'Switzerland'
# Restrict to the country-health-index features (plus identifiers and the
# 'Mortality Rate' target) and to the two paired countries, in one step.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'extreme_poverty',
                       'gdp_per_capita', 'population_density',
                       'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            country_health_cols]
# Display the filtered dataframe (notebook cell output)
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 106.749 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 106.749 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 106.749 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 106.749 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 106.749 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 0.322149 |
2078 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features turn the time series into a supervised-learning
# problem: previous day (shift 1), previous week (shift 7), previous month
# (shift 30), computed per country.
mortality_lags = {'prev_day_mortality': 1,
                  'prev_week_mortality': 7,
                  'prev_month_mortality': 30}
for lag_col, lag_days in mortality_lags.items():
    # shift() leaves NaN at the head of each country's series; treat those as 0
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Fit PCA on every column from index 2 onward to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and its lagged
# copies, so the target itself feeds the principal components (target
# leakage) — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 5 principal components — matching the 5 input variables of
# the XGBoost Model Analysis for the country health index (the original
# comment said 3, which did not match the code's value of 5)
n_components = 5
pc_scores = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): each principal component is a linear mix of ALL inputs, so
# labelling the component columns with the raw feature names is only a
# convenience — the downstream "feature importance" table actually ranks
# components, not raw features.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population_density']
principal_df = pd.DataFrame(data=pc_scores, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical location column with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs are the component scores; the target is the raw mortality rate
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only (avoids test-set leakage)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both splits (scaler was fit on train only)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor; hyperparameters are chosen by the grid search below
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space
search_space = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=search_space, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9974454800061979
# Refit the tuned estimator on the full (scaled) training set
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Test-set metrics: MSE, RMSE, R^2 score, and entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to
# probability distributions and returns their KL divergence — this is not a
# standard regression error metric; confirm it is what was intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Rank the PCA-derived inputs by their XGBoost importance scores
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
# Display the importance table (notebook cell output)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.860290 |
| 0 | hospital_beds_per_thousand | 0.062671 |
| 2 | extreme_poverty | 0.042865 |
| 3 | gdp_per_capita | 0.032818 |
| 4 | population_density | 0.001356 |
# Importing the dataframe of all 26 countries
# NOTE(review): re-reads the same exported CSV from Downloads (the export above
# wrote to the working directory) — confirm the two paths refer to the same file.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded dataframe (notebook cell output)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for this analysis
country1 = 'Belgium'
country2 = 'Canada'
# Restrict to the population-health-index features (plus identifiers and the
# 'Mortality Rate' target) and to the two paired countries, in one step.
pop_health_cols = ['location', 'date', 'cardiovasc_death_rate',
                   'diabetes_prevalence', 'female_smokers', 'male_smokers',
                   'life_expectancy', 'aged_65_older', 'median_age',
                   'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            pop_health_cols]
# Display the filtered dataframe (notebook cell output)
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.093162 |
2132 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features turn the time series into a supervised-learning
# problem: previous day (shift 1), previous week (shift 7), previous month
# (shift 30), computed per country.
mortality_lags = {'prev_day_mortality': 1,
                  'prev_week_mortality': 7,
                  'prev_month_mortality': 30}
for lag_col, lag_days in mortality_lags.items():
    # shift() leaves NaN at the head of each country's series; treat those as 0
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Fit PCA on every column from index 2 onward to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and its lagged
# copies, so the target itself feeds the principal components (target
# leakage) — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 7 principal components — matching the 7 input variables of
# the XGBoost Model Analysis for the population health index (the original
# comment said 3, which did not match the code's value of 7)
n_components = 7
pc_scores = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): each principal component is a linear mix of ALL inputs, so
# labelling the component columns with the raw feature names is only a
# convenience — the downstream "feature importance" table actually ranks
# components, not raw features.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence',
                 'female_smokers', 'male_smokers', 'life_expectancy',
                 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(data=pc_scores, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical location column with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs are the component scores; the target is the raw mortality rate
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only (avoids test-set leakage)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both splits (scaler was fit on train only)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor; hyperparameters are chosen by the grid search below
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space
search_space = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=search_space, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989190282407456
# Refit the tuned estimator on the full (scaled) training set
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Test-set metrics: MSE, RMSE, R^2 score, and entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to
# probability distributions and returns their KL divergence — this is not a
# standard regression error metric; confirm it is what was intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Rank the PCA-derived inputs by their XGBoost importance scores
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
# Display the importance table (notebook cell output)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.852780 |
| 0 | cardiovasc_death_rate | 0.064945 |
| 5 | aged_65_older | 0.033227 |
| 6 | median_age | 0.028916 |
| 2 | female_smokers | 0.016257 |
| 3 | male_smokers | 0.003580 |
| 4 | life_expectancy | 0.000296 |
# Load the combined dataframe of all 26 countries
# NOTE(review): absolute local Windows path — not portable across machines
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair analyzed in this run of the country-health-index model
country1 = 'Belgium'
country2 = 'Canada'
# Keep identifiers, the country-health-index predictors, and the target
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 1.093162 |
2132 rows × 8 columns
'''
Convert the per-country mortality time series into a supervised-learning
table for XGBoost by adding lagged copies of the mortality rate
(previous day, previous week, previous month) via pandas shift():
each row becomes one observation and each column one feature.
'''
# Lags are computed within each country (groupby 'location') so values
# never bleed across country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first 1/7/30 rows per country have no lag value; fill with 0.
# NOTE(review): 0 is also a legitimate mortality value — dropping these
# rows instead may be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# PCA to mitigate multi-collinearity among the predictors.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns,
# so the components are computed from the target itself (target leakage,
# likely inflating downstream scores); PCA is also fit on all rows before
# the train/test split. Confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA() — in a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 5 principal components — one per input variable of the
# country-health-index XGBoost model (the previous comment said 3, which
# did not match the code).
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Wrap the components in a DataFrame.
# NOTE(review): these columns are principal components, not the original
# variables — reusing the raw feature names here is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location' with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
# X: the PCA components; y: the (un-lagged) mortality-rate target
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split into a 70/30 train/test split (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only (the test set must not influence scaling)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler() — in a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
# Apply the same (train-fitted) scaling to the test set
X_test_scaled = scaler.transform(X_test)
# Define the XGBoost regressor (library defaults; tuned via the grid below)
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid: 3*3*3*3*2*2 = 324 candidate combinations
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Exhaustive grid search with 10-fold cross-validation (k = 10) on the
# training data; n_jobs=-1 parallelizes over all cores. With no explicit
# `scoring`, a regressor is scored by R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameters and the best mean CV score (R^2)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987706326919069
# GridSearchCV (refit=True by default) has already refit the best
# hyperparameter combination on the whole training set, so
# best_estimator_ is ready to use; the extra fit() call that used to
# follow was redundant and has been removed.
best_model = grid_search.best_estimator_
# Predict on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate: Mean Squared Error (MSE), Root MSE, R^2 score, and "entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions and, given two arguments, returns their KL divergence —
# applying it to raw mortality-rate vectors is statistically questionable;
# confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.018375766572997825 R2 Score: 0.9986104757656693 RMSE: 0.135557 Entropy Value: 0.0005417545801711733
# Rank model inputs by XGBoost's feature_importances_ attribute.
# NOTE(review): X holds PCA components that were relabeled with the raw
# column names upstream, so these importances describe components, not
# the original variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.859285 |
| 0 | hospital_beds_per_thousand | 0.098572 |
| 2 | extreme_poverty | 0.027225 |
| 3 | gdp_per_capita | 0.013788 |
| 4 | population_density | 0.001130 |
# Reload the combined dataframe of all 26 countries (fresh copy, since the
# previous section overwrote df_updated)
# NOTE(review): absolute local Windows path — not portable across machines
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair analyzed in this run of the population-health-index model
country1 = 'Bulgaria'
country2 = 'Serbia'
# Keep identifiers, the population-health-index predictors, and the target
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716205 |
2065 rows × 10 columns
'''
Convert the per-country mortality time series into a supervised-learning
table for XGBoost by adding lagged copies of the mortality rate
(previous day, previous week, previous month) via pandas shift():
each row becomes one observation and each column one feature.
'''
# Lags are computed within each country (groupby 'location') so values
# never bleed across country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first 1/7/30 rows per country have no lag value; fill with 0.
# NOTE(review): 0 is also a legitimate mortality value — dropping these
# rows instead may be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# PCA to mitigate multi-collinearity among the predictors.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns,
# so the components are computed from the target itself (target leakage,
# likely inflating downstream scores); PCA is also fit on all rows before
# the train/test split. Confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA() — in a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 7 principal components — one per input variable of the
# population-health-index XGBoost model (the previous comment said 3,
# which did not match the code).
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Wrap the components in a DataFrame.
# NOTE(review): these columns are principal components, not the original
# variables — reusing the raw feature names here is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location' with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: the PCA components; y: the (un-lagged) mortality-rate target
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split into a 70/30 train/test split (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only (the test set must not influence scaling)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler() — in a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
# Apply the same (train-fitted) scaling to the test set
X_test_scaled = scaler.transform(X_test)
# Define the XGBoost regressor (library defaults; tuned via the grid below)
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid: 3*3*3*3*2*2 = 324 candidate combinations
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Exhaustive grid search with 10-fold cross-validation (k = 10) on the
# training data; n_jobs=-1 parallelizes over all cores. With no explicit
# `scoring`, a regressor is scored by R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameters and the best mean CV score (R^2)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9604734653614051
# GridSearchCV (refit=True by default) has already refit the best
# hyperparameter combination on the whole training set, so
# best_estimator_ is ready to use; the extra fit() call that used to
# follow was redundant and has been removed.
best_model = grid_search.best_estimator_
# Predict on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate: Mean Squared Error (MSE), Root MSE, R^2 score, and "entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions and, given two arguments, returns their KL divergence —
# applying it to raw mortality-rate vectors is statistically questionable;
# confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0021235139380423036 R2 Score: 0.9989608533920085 RMSE: 0.046082 Entropy Value: 0.00020835376815230597
# Rank model inputs by XGBoost's feature_importances_ attribute.
# NOTE(review): X holds PCA components that were relabeled with the raw
# column names upstream, so these importances describe components, not
# the original variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.583303 |
| 5 | aged_65_older | 0.190242 |
| 6 | median_age | 0.118268 |
| 2 | female_smokers | 0.043382 |
| 1 | diabetes_prevalence | 0.043157 |
| 4 | life_expectancy | 0.011155 |
| 3 | male_smokers | 0.010494 |
# Reload the combined dataframe of all 26 countries (fresh copy, since the
# previous section overwrote df_updated)
# NOTE(review): absolute local Windows path — not portable across machines
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair analyzed in this run of the country-health-index model
country1 = 'Bulgaria'
country2 = 'Serbia'
# Keep identifiers, the country-health-index predictors, and the target
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 0.716205 |
2065 rows × 8 columns
'''
Convert the per-country mortality time series into a supervised-learning
table for XGBoost by adding lagged copies of the mortality rate
(previous day, previous week, previous month) via pandas shift():
each row becomes one observation and each column one feature.
'''
# Lags are computed within each country (groupby 'location') so values
# never bleed across country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first 1/7/30 rows per country have no lag value; fill with 0.
# NOTE(review): 0 is also a legitimate mortality value — dropping these
# rows instead may be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# PCA to mitigate multi-collinearity among the predictors.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns,
# so the components are computed from the target itself (target leakage,
# likely inflating downstream scores); PCA is also fit on all rows before
# the train/test split. Confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA() — in a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 5 principal components — one per input variable of the
# country-health-index XGBoost model (the previous comment said 3, which
# did not match the code).
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Wrap the components in a DataFrame.
# NOTE(review): these columns are principal components, not the original
# variables — reusing the raw feature names here is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location' with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
# X: the PCA components; y: the (un-lagged) mortality-rate target
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split into a 70/30 train/test split (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only (the test set must not influence scaling)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler() — in a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
# Apply the same (train-fitted) scaling to the test set
X_test_scaled = scaler.transform(X_test)
# Define the XGBoost regressor (library defaults; tuned via the grid below)
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid: 3*3*3*3*2*2 = 324 candidate combinations
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Exhaustive grid search with 10-fold cross-validation (k = 10) on the
# training data; n_jobs=-1 parallelizes over all cores. With no explicit
# `scoring`, a regressor is scored by R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameters and the best mean CV score (R^2)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9604140465460519
# GridSearchCV (refit=True by default) has already refit the best
# hyperparameter combination on the whole training set, so
# best_estimator_ is ready to use; the extra fit() call that used to
# follow was redundant and has been removed.
best_model = grid_search.best_estimator_
# Predict on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate: Mean Squared Error (MSE), Root MSE, R^2 score, and "entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions and, given two arguments, returns their KL divergence —
# applying it to raw mortality-rate vectors is statistically questionable;
# confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.006126929070284454 R2 Score: 0.9970017726529921 RMSE: 0.078275 Entropy Value: 0.0006566407159103827
# Rank model inputs by XGBoost's feature_importances_ attribute.
# NOTE(review): X holds PCA components that were relabeled with the raw
# column names upstream, so these importances describe components, not
# the original variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | hospital_beds_per_thousand | 0.649173 |
| 1 | human_development_index | 0.179720 |
| 2 | extreme_poverty | 0.087485 |
| 4 | population_density | 0.042793 |
| 3 | gdp_per_capita | 0.040829 |
# Reload the combined dataframe of all 26 countries (fresh copy, since the
# previous section overwrote df_updated)
# NOTE(review): absolute local Windows path — not portable across machines
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair analyzed in this run of the population-health-index model
country1 = 'Cyprus'
country2 = 'Luxembourg'
# Keep identifiers, the population-health-index predictors, and the target
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
2068 rows × 10 columns
'''
Convert the per-country mortality time series into a supervised-learning
table for XGBoost by adding lagged copies of the mortality rate
(previous day, previous week, previous month) via pandas shift():
each row becomes one observation and each column one feature.
'''
# Lags are computed within each country (groupby 'location') so values
# never bleed across country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first 1/7/30 rows per country have no lag value; fill with 0.
# NOTE(review): 0 is also a legitimate mortality value — dropping these
# rows instead may be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 3 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert the categorical 'location' column to dummy columns with get_dummies.
# NOTE(review): the dummy columns are never used below — X is built from the
# PCA frame and y from 'Mortality Rate' — so this step only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into 70% training / 30% testing with a fixed seed for
# reproducibility of the XGBoost results
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only, so no test-set statistics leak
# into the scaling parameters
scaler = StandardScaler()
scaler.fit(X_train)
# Apply the training-set scaling to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor whose hyperparameters are tuned below
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10), using all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9949062193563151
# Refit the best grid-search configuration on the full (scaled) training set
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# between the two vectors treated as (normalized) probability distributions —
# it is not a standard regression metric, and zeros in either vector can make
# it inf (as happens for one of the country pairs below). Consider dropping it
# or replacing it with MAE.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0010948126043919594 R2 Score: 0.9971226073313326 RMSE: 0.033088 Entropy Value: 0.0007353489441030162
# NOTE(review): these importances belong to the principal components that were
# fed to the model, not to the original variables — the names attached below
# are the reused column labels of principal_df, so interpret with caution.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.692885 |
| 0 | cardiovasc_death_rate | 0.156869 |
| 6 | median_age | 0.062790 |
| 5 | aged_65_older | 0.033893 |
| 2 | female_smokers | 0.031649 |
| 3 | male_smokers | 0.012797 |
| 4 | life_expectancy | 0.009117 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; consider a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this section
country1 = 'Cyprus'
country2 = 'Luxembourg'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
# Restrict to the two countries. .copy() ensures we own the frame, so the
# lagged-column assignments made later do not write into a view of the
# original DataFrame (avoids pandas SettingWithCopyWarning).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 4.51 | 0.916 | 0.20 | 94277.965 | 231.447 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 4.51 | 0.916 | 0.20 | 94277.965 | 231.447 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 4.51 | 0.916 | 0.20 | 94277.965 | 231.447 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 4.51 | 0.916 | 0.20 | 94277.965 | 231.447 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 4.51 | 0.916 | 0.20 | 94277.965 | 231.447 | 0.377872 |
2068 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, previous week, and previous
# month mortality rates, computed independently within each country
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; fill those NaNs with 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# BUG FIX: the PCA input previously included the target column 'Mortality Rate'
# itself, leaking the target into the principal components used as features.
# NOTE(review): the lagged mortality columns are still included here — confirm
# whether they should also be excluded from the predictor matrix.
feature_matrix = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(feature_matrix)
# Keep 5 principal components — one per input variable for the XGBoost Model
# Analysis of the country health index (the original comment said 3, which
# did not match the code).
n_components = 5
principal_components = pca.transform(feature_matrix)[:, :n_components]
# NOTE(review): the columns below are principal components (linear combinations
# of ALL predictors), not the original variables — the original names are kept
# only so the downstream column selection keeps working, but they are misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
# Convert the categorical 'location' column to dummy columns with get_dummies.
# NOTE(review): the dummy columns are never used below — X is built from the
# PCA frame and y from 'Mortality Rate' — so this step only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into 70% training / 30% testing with a fixed seed for
# reproducibility of the XGBoost results
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only, so no test-set statistics leak
# into the scaling parameters
scaler = StandardScaler()
scaler.fit(X_train)
# Apply the training-set scaling to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor whose hyperparameters are tuned below
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10), using all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9949940053151926
# Refit the best grid-search configuration on the full (scaled) training set
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# between the two vectors treated as (normalized) probability distributions —
# it is not a standard regression metric, and zeros in either vector can make
# it inf. Consider dropping it or replacing it with MAE.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0014876690956331902 R2 Score: 0.9960900996828079 RMSE: 0.038570 Entropy Value: 0.0011182439482911944
# NOTE(review): these importances belong to the principal components that were
# fed to the model, not to the original variables — the names attached below
# are the reused column labels of principal_df, so interpret with caution.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.638180 |
| 0 | hospital_beds_per_thousand | 0.134692 |
| 2 | extreme_poverty | 0.104035 |
| 4 | population_density | 0.089664 |
| 3 | gdp_per_capita | 0.033430 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; consider a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this section
country1 = 'Czechia'
country2 = 'Romania'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict to the two countries. .copy() ensures we own the frame, so the
# lagged-column assignments made later do not write into a view of the
# original DataFrame (avoids pandas SettingWithCopyWarning).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
2072 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, previous week, and previous
# month mortality rates, computed independently within each country
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; fill those NaNs with 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# BUG FIX: the PCA input previously included the target column 'Mortality Rate'
# itself, leaking the target into the principal components used as features.
# NOTE(review): the lagged mortality columns are still included here — confirm
# whether they should also be excluded from the predictor matrix.
feature_matrix = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(feature_matrix)
# Keep 7 principal components — one per input variable for the XGBoost Model
# Analysis of the population health index (the original comment said 3, which
# did not match the code).
n_components = 7
principal_components = pca.transform(feature_matrix)[:, :n_components]
# NOTE(review): the columns below are principal components (linear combinations
# of ALL predictors), not the original variables — the original names are kept
# only so the downstream column selection keeps working, but they are misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert the categorical 'location' column to dummy columns with get_dummies.
# NOTE(review): the dummy columns are never used below — X is built from the
# PCA frame and y from 'Mortality Rate' — so this step only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into 70% training / 30% testing with a fixed seed for
# reproducibility of the XGBoost results
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only, so no test-set statistics leak
# into the scaling parameters
scaler = StandardScaler()
scaler.fit(X_train)
# Apply the training-set scaling to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor whose hyperparameters are tuned below
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10), using all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987279834799028
# Refit the best grid-search configuration on the full (scaled) training set
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# between the two vectors treated as (normalized) probability distributions —
# it is not a standard regression metric, and zeros in either vector can make
# it inf. Consider dropping it or replacing it with MAE.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0035563501611279766 R2 Score: 0.9975591549551829 RMSE: 0.059635 Entropy Value: 0.00035041497875719764
# NOTE(review): these importances belong to the principal components that were
# fed to the model, not to the original variables — the names attached below
# are the reused column labels of principal_df, so interpret with caution.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.598329 |
| 1 | diabetes_prevalence | 0.185027 |
| 5 | aged_65_older | 0.127458 |
| 6 | median_age | 0.069055 |
| 2 | female_smokers | 0.016719 |
| 3 | male_smokers | 0.002718 |
| 4 | life_expectancy | 0.000695 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; consider a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this section
country1 = 'Czechia'
country2 = 'Romania'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
# Restrict to the two countries. .copy() ensures we own the frame, so the
# lagged-column assignments made later do not write into a view of the
# original DataFrame (avoids pandas SettingWithCopyWarning).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 2.036403 |
2072 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, previous week, and previous
# month mortality rates, computed independently within each country
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; fill those NaNs with 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# BUG FIX: the PCA input previously included the target column 'Mortality Rate'
# itself, leaking the target into the principal components used as features.
# NOTE(review): the lagged mortality columns are still included here — confirm
# whether they should also be excluded from the predictor matrix.
feature_matrix = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(feature_matrix)
# Keep 5 principal components — one per input variable for the XGBoost Model
# Analysis of the country health index (the original comment said 3, which
# did not match the code).
n_components = 5
principal_components = pca.transform(feature_matrix)[:, :n_components]
# NOTE(review): the columns below are principal components (linear combinations
# of ALL predictors), not the original variables — the original names are kept
# only so the downstream column selection keeps working, but they are misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
# Convert the categorical 'location' column to dummy columns with get_dummies.
# NOTE(review): the dummy columns are never used below — X is built from the
# PCA frame and y from 'Mortality Rate' — so this step only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into 70% training / 30% testing with a fixed seed for
# reproducibility of the XGBoost results
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only, so no test-set statistics leak
# into the scaling parameters
scaler = StandardScaler()
scaler.fit(X_train)
# Apply the training-set scaling to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor whose hyperparameters are tuned below
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10), using all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9980242375402296
# Refit the best grid-search configuration on the full (scaled) training set
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# between the two vectors treated as (normalized) probability distributions —
# it is not a standard regression metric; zeros in y_pred make it inf, which is
# exactly what the printed output below shows for this country pair. Consider
# dropping it or replacing it with MAE.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0037624148328907537 R2 Score: 0.9974177257060381 RMSE: 0.061339 Entropy Value: inf
# NOTE(review): these importances belong to the principal components that were
# fed to the model, not to the original variables — the names attached below
# are the reused column labels of principal_df, so interpret with caution.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | hospital_beds_per_thousand | 0.862863 |
| 1 | human_development_index | 0.112243 |
| 2 | extreme_poverty | 0.019148 |
| 3 | gdp_per_capita | 0.004974 |
| 4 | population_density | 0.000772 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — parameterise or use a
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this analysis
country1 = 'Denmark'
country2 = 'Ireland'
# Keep the population-health features (plus identifiers and the target) and
# restrict the rows to the two selected countries in a single .loc step
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
             'female_smokers', 'male_smokers', 'life_expectancy',
             'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.9 | 19.677 | 42.3 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.9 | 19.677 | 42.3 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.9 | 19.677 | 42.3 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.9 | 19.677 | 42.3 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.9 | 19.677 | 42.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.3 | 13.928 | 38.7 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.3 | 13.928 | 38.7 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.3 | 13.928 | 38.7 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.3 | 13.928 | 38.7 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.3 | 13.928 | 38.7 | 0.491388 |
2097 rows × 10 columns
# Convert the time series into a supervised-learning table by adding lagged
# copies of the target: the mortality rate 1 day, 7 days, and 30 days earlier,
# computed per country so a lag never crosses a country boundary. Rows with no
# history yet (the first lag-length rows per country) get 0 instead of NaN.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Fit PCA on every numeric column (all columns after 'location' and 'date')
# to address multi-collinearity among the predictors.
# NOTE(review): this matrix includes 'Mortality Rate' and its lag columns, i.e.
# the prediction target leaks into the components, and the columns are not
# standardised before PCA — both likely inflate the downstream R^2. Confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 7 principal components — one per population-health input
# variable used by this XGBoost analysis (the original comment said 3; the
# code has always used 7)
n_components = 7
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# Wrap the components in a dataframe.
# NOTE(review): these columns are principal components, not the raw features —
# reusing the original feature names is misleading (consider PC1..PC7).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics (mean/std) from the training split only,
# so the test split contributes nothing to the scaler (no leakage)
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Define the XGBoost regressor. A fixed random_state makes the stochastic
# row/column subsampling tuned below (subsample / colsample_bytree < 1)
# reproducible across reruns.
xgb_model = xgb.XGBRegressor(random_state=42)
# Hyperparameter grid to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
# Exhaustive grid search with 10-fold cross-validation (k = 10); scoring
# defaults to the regressor's R^2, and n_jobs=-1 uses all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameter combination and its mean CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9991336553758403
# GridSearchCV (refit=True by default) has already refit the best estimator
# on the full training set, so no extra fit call is needed here
best_model = grid_search.best_estimator_
# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the model: MSE, RMSE, R^2 Score, and entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy's entropy(pk, qk) is the KL divergence of the two normalised vectors;
# it is only finite when qk is strictly positive wherever pk > 0. Zero-mortality
# days and non-positive predictions can make this inf, so restrict the
# comparison to strictly positive pairs.
positive = (y_test > 0) & (y_pred > 0)
entropy_val = entropy(y_test[positive], y_pred[positive])
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0039824430069437 R2 Score: 0.9985146782977569 RMSE: 0.063107 Entropy Value: 0.00082019994222745
# Rank each input column by the trained model's importance score.
# NOTE(review): X was built from principal_df (principal components), so these
# labels actually name PCs, not the raw features — consider PC1..PCk labels.
importance_scores = best_model.feature_importances_
feature_importances = (pd.DataFrame({'feature': selected_cols,
                                     'importance': importance_scores})
                       .sort_values('importance', ascending=False))
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.793221 |
| 0 | cardiovasc_death_rate | 0.123281 |
| 6 | median_age | 0.063364 |
| 2 | female_smokers | 0.012535 |
| 3 | male_smokers | 0.003768 |
| 5 | aged_65_older | 0.003571 |
| 4 | life_expectancy | 0.000260 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — parameterise or use a
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this analysis
country1 = 'Denmark'
country2 = 'Ireland'
# Keep the country-health features (plus identifiers and the target) and
# restrict the rows to the two selected countries in a single .loc step
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
             'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 2.50 | 0.940 | 0.2 | 46682.515 | 136.520 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 2.50 | 0.940 | 0.2 | 46682.515 | 136.520 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 2.50 | 0.940 | 0.2 | 46682.515 | 136.520 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 2.50 | 0.940 | 0.2 | 46682.515 | 136.520 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 2.50 | 0.940 | 0.2 | 46682.515 | 136.520 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 0.491388 |
2097 rows × 8 columns
# Convert the time series into a supervised-learning table by adding lagged
# copies of the target: the mortality rate 1 day, 7 days, and 30 days earlier,
# computed per country so a lag never crosses a country boundary. Rows with no
# history yet (the first lag-length rows per country) get 0 instead of NaN.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Fit PCA on every numeric column (all columns after 'location' and 'date')
# to address multi-collinearity among the predictors.
# NOTE(review): this matrix includes 'Mortality Rate' and its lag columns, i.e.
# the prediction target leaks into the components, and the columns are not
# standardised before PCA — both likely inflate the downstream R^2. Confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 5 principal components — one per country-health input
# variable used by this XGBoost analysis (the original comment said 3; the
# code has always used 5)
n_components = 5
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# Wrap the components in a dataframe.
# NOTE(review): these columns are principal components, not the raw features —
# reusing the original feature names is misleading (consider PC1..PC5).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population_density']
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics (mean/std) from the training split only,
# so the test split contributes nothing to the scaler (no leakage)
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Define the XGBoost regressor. A fixed random_state makes the stochastic
# row/column subsampling tuned below (subsample / colsample_bytree < 1)
# reproducible across reruns.
xgb_model = xgb.XGBRegressor(random_state=42)
# Hyperparameter grid to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
# Exhaustive grid search with 10-fold cross-validation (k = 10); scoring
# defaults to the regressor's R^2, and n_jobs=-1 uses all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameter combination and its mean CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9989935922633931
# GridSearchCV (refit=True by default) has already refit the best estimator
# on the full training set, so no extra fit call is needed here
best_model = grid_search.best_estimator_
# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the model: MSE, RMSE, R^2 Score, and entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy's entropy(pk, qk) is the KL divergence of the two normalised vectors;
# it is only finite when qk is strictly positive wherever pk > 0. Zero-mortality
# days and non-positive predictions can make this inf, so restrict the
# comparison to strictly positive pairs.
positive = (y_test > 0) & (y_pred > 0)
entropy_val = entropy(y_test[positive], y_pred[positive])
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0036179936828483242 R2 Score: 0.9986506060409794 RMSE: 0.060150 Entropy Value: 0.000665447331955437
# Rank each input column by the trained model's importance score.
# NOTE(review): X was built from principal_df (principal components), so these
# labels actually name PCs, not the raw features — consider PC1..PCk labels.
importance_scores = best_model.feature_importances_
feature_importances = (pd.DataFrame({'feature': selected_cols,
                                     'importance': importance_scores})
                       .sort_values('importance', ascending=False))
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.731394 |
| 0 | hospital_beds_per_thousand | 0.224461 |
| 2 | extreme_poverty | 0.024741 |
| 3 | gdp_per_capita | 0.016963 |
| 4 | population_density | 0.002441 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — parameterise or use a
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this analysis
country1 = 'Estonia'
country2 = 'Latvia'
# Keep the population-health features (plus identifiers and the target) and
# restrict the rows to the two selected countries in a single .loc step
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
             'female_smokers', 'male_smokers', 'life_expectancy',
             'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6250 | Estonia | 1/18/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6251 | Estonia | 2/5/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6252 | Estonia | 2/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6253 | Estonia | 2/7/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631969 |
2099 rows × 10 columns
# Convert the time series into a supervised-learning table by adding lagged
# copies of the target: the mortality rate 1 day, 7 days, and 30 days earlier,
# computed per country so a lag never crosses a country boundary. Rows with no
# history yet (the first lag-length rows per country) get 0 instead of NaN.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Fit PCA on every numeric column (all columns after 'location' and 'date')
# to address multi-collinearity among the predictors.
# NOTE(review): this matrix includes 'Mortality Rate' and its lag columns, i.e.
# the prediction target leaks into the components, and the columns are not
# standardised before PCA — both likely inflate the downstream R^2. Confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 7 principal components — one per population-health input
# variable used by this XGBoost analysis (the original comment said 3; the
# code has always used 7)
n_components = 7
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# Wrap the components in a dataframe.
# NOTE(review): these columns are principal components, not the raw features —
# reusing the original feature names is misleading (consider PC1..PC7).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics (mean/std) from the training split only,
# so the test split contributes nothing to the scaler (no leakage)
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Define the XGBoost regressor. A fixed random_state makes the stochastic
# row/column subsampling tuned below (subsample / colsample_bytree < 1)
# reproducible across reruns.
xgb_model = xgb.XGBRegressor(random_state=42)
# Hyperparameter grid to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
# Exhaustive grid search with 10-fold cross-validation (k = 10); scoring
# defaults to the regressor's R^2, and n_jobs=-1 uses all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameter combination and its mean CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9983509839471967
# GridSearchCV (refit=True by default) has already refit the best estimator
# on the full training set, so no extra fit call is needed here
best_model = grid_search.best_estimator_
# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the model: MSE, RMSE, R^2 Score, and entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy's entropy(pk, qk) is the KL divergence of the two normalised vectors;
# it is only finite when qk is strictly positive wherever pk > 0. Zero-mortality
# days and non-positive predictions previously made this inf, so restrict the
# comparison to strictly positive pairs.
positive = (y_test > 0) & (y_pred > 0)
entropy_val = entropy(y_test[positive], y_pred[positive])
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0011442225986659504 R2 Score: 0.998102656857225 RMSE: 0.033826 Entropy Value: inf
# Rank each input column by the trained model's importance score.
# NOTE(review): X was built from principal_df (principal components), so these
# labels actually name PCs, not the raw features — consider PC1..PCk labels.
importance_scores = best_model.feature_importances_
feature_importances = (pd.DataFrame({'feature': selected_cols,
                                     'importance': importance_scores})
                       .sort_values('importance', ascending=False))
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.764753 |
| 0 | cardiovasc_death_rate | 0.126479 |
| 5 | aged_65_older | 0.055045 |
| 6 | median_age | 0.033800 |
| 2 | female_smokers | 0.018890 |
| 3 | male_smokers | 0.000750 |
| 4 | life_expectancy | 0.000282 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — parameterise or use a
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this analysis
country1 = 'Estonia'
country2 = 'Latvia'
# Keep the country-health features (plus identifiers and the target) and
# restrict the rows to the two selected countries in a single .loc step
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
             'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 0.000000 |
| 6250 | Estonia | 1/18/2020 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 0.000000 |
| 6251 | Estonia | 2/5/2020 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 0.000000 |
| 6252 | Estonia | 2/6/2020 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 0.000000 |
| 6253 | Estonia | 2/7/2020 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 0.631969 |
2099 rows × 8 columns
# Convert the time series into a supervised-learning table by adding lagged
# copies of the target: the mortality rate 1 day, 7 days, and 30 days earlier,
# computed per country so a lag never crosses a country boundary. Rows with no
# history yet (the first lag-length rows per country) get 0 instead of NaN.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Fit PCA on every numeric column (all columns after 'location' and 'date')
# to address multi-collinearity among the predictors.
# NOTE(review): this matrix includes 'Mortality Rate' and its lag columns, i.e.
# the prediction target leaks into the components, and the columns are not
# standardised before PCA — both likely inflate the downstream R^2. Confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first n_components principal components as the inputs for the
# XGBoost country-health-index model (the original comment said 3, but the
# code keeps 5 components — the comment was stale).
n_components = 5
# NOTE(review): the PCA fitted above used df_updated.iloc[:, 2:], which
# contains the target column 'Mortality Rate' (and its lagged copies) — that
# leaks the target into the model inputs and inflates the test metrics.
# Refit here on the predictor columns only.
predictors = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA().fit(predictors)
principal_components = pca.transform(predictors)[:, :n_components]
# Each principal component is a linear mixture of ALL predictors, so label
# the columns PC1..PCn; reusing the original feature names here would
# misattribute the feature importances reported later.
selected_cols = ['PC%d' % (i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# Convert the categorical 'location' column to numerical indicator columns
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as the test set for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardising scaler on the training partition only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both partitions with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the grid search.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9988172536777192
# grid_search uses the default refit=True, so best_estimator_ is already
# fitted on the full training set; the explicit fit() below is redundant
# but harmless and is kept to preserve the original flow.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model with MSE, RMSE, R^2, and entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) is the KL divergence between the two vectors
# after normalising each to sum to 1 — it is not a standard regression
# metric, and it returns inf whenever some qk element is <= 0 while the
# matching pk element is > 0 (which is why the original run printed inf).
# Clip the predictions to a tiny positive floor so the value stays finite.
entropy_val = entropy(y_test, np.maximum(y_pred, 1e-12))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0011498507246205968 R2 Score: 0.9980933243320684 RMSE: 0.033909 Entropy Value: inf
# Tabulate the tuned model's feature importances, largest first.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.587121 |
| 0 | hospital_beds_per_thousand | 0.383466 |
| 2 | extreme_poverty | 0.025717 |
| 3 | gdp_per_capita | 0.003274 |
| 4 | population_density | 0.000422 |
# Reload the full 26-country dataframe for the next analysis segment.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Finland'
country2 = 'Iceland'
# Keep only the columns used by the XGBoost population-health-index analysis,
# then restrict the rows to the two countries being compared.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7311 | Finland | 1/30/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7312 | Finland | 1/31/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7313 | Finland | 2/1/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7314 | Finland | 2/2/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
2102 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() lags by row position within each location group, so this
# assumes the rows are already sorted chronologically per country — TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no lagged value available yet).
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the target column 'Mortality Rate' and its
# lags, so the target leaks into the PCA inputs used downstream; PCA is also
# fit on unscaled data, so large-magnitude columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first n_components principal components as the inputs for the
# XGBoost population-health-index model (the original comment said 3, but
# the code keeps 7 components — the comment was stale).
n_components = 7
# NOTE(review): the PCA fitted above used df_updated.iloc[:, 2:], which
# contains the target column 'Mortality Rate' (and its lagged copies) — that
# leaks the target into the model inputs and inflates the test metrics.
# Refit here on the predictor columns only.
predictors = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA().fit(predictors)
principal_components = pca.transform(predictors)[:, :n_components]
# Each principal component is a linear mixture of ALL predictors, so label
# the columns PC1..PCn; reusing the original feature names here would
# misattribute the feature importances reported later.
selected_cols = ['PC%d' % (i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# Convert the categorical 'location' column to numerical indicator columns
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as the test set for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardising scaler on the training partition only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both partitions with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the grid search.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9967115293459013
# grid_search uses the default refit=True, so best_estimator_ is already
# fitted on the full training set; the explicit fit() below is redundant
# but harmless and is kept to preserve the original flow.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model with MSE, RMSE, R^2, and entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) is the KL divergence between the two vectors
# after normalising each to sum to 1 — it is not a standard regression
# metric, and it returns inf whenever some qk element is <= 0 while the
# matching pk element is > 0. Clip the predictions to a tiny positive floor
# so the value stays finite.
entropy_val = entropy(y_test, np.maximum(y_pred, 1e-12))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002631359506918055 R2 Score: 0.9977846875396988 RMSE: 0.051297 Entropy Value: 0.0008231147347694647
# Tabulate the tuned model's feature importances, largest first.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.424154 |
| 0 | cardiovasc_death_rate | 0.346029 |
| 2 | female_smokers | 0.087274 |
| 5 | aged_65_older | 0.081840 |
| 6 | median_age | 0.050001 |
| 3 | male_smokers | 0.008944 |
| 4 | life_expectancy | 0.001758 |
# Reload the full 26-country dataframe for the next analysis segment.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Finland'
country2 = 'Iceland'
# Keep only the columns used by the XGBoost country-health-index analysis,
# then restrict the rows to the two countries being compared.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 0.00000 |
| 7311 | Finland | 1/30/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 0.00000 |
| 7312 | Finland | 1/31/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 0.00000 |
| 7313 | Finland | 2/1/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 0.00000 |
| 7314 | Finland | 2/2/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 0.11011 |
2102 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() lags by row position within each location group, so this
# assumes the rows are already sorted chronologically per country — TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no lagged value available yet).
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the target column 'Mortality Rate' and its
# lags, so the target leaks into the PCA inputs used downstream; PCA is also
# fit on unscaled data, so large-magnitude columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first n_components principal components as the inputs for the
# XGBoost country-health-index model (the original comment said 3, but the
# code keeps 5 components — the comment was stale).
n_components = 5
# NOTE(review): the PCA fitted above used df_updated.iloc[:, 2:], which
# contains the target column 'Mortality Rate' (and its lagged copies) — that
# leaks the target into the model inputs and inflates the test metrics.
# Refit here on the predictor columns only.
predictors = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA().fit(predictors)
principal_components = pca.transform(predictors)[:, :n_components]
# Each principal component is a linear mixture of ALL predictors, so label
# the columns PC1..PCn; reusing the original feature names here would
# misattribute the feature importances reported later.
selected_cols = ['PC%d' % (i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# Convert the categorical 'location' column to numerical indicator columns
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as the test set for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardising scaler on the training partition only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both partitions with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the grid search.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9968002075493546
# grid_search uses the default refit=True, so best_estimator_ is already
# fitted on the full training set; the explicit fit() below is redundant
# but harmless and is kept to preserve the original flow.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model with MSE, RMSE, R^2, and entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) is the KL divergence between the two vectors
# after normalising each to sum to 1 — it is not a standard regression
# metric, and it returns inf whenever some qk element is <= 0 while the
# matching pk element is > 0. Clip the predictions to a tiny positive floor
# so the value stays finite.
entropy_val = entropy(y_test, np.maximum(y_pred, 1e-12))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0025838187798911654 R2 Score: 0.9978247115518787 RMSE: 0.050831 Entropy Value: 0.0013343036085185639
# Tabulate the tuned model's feature importances, largest first.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | hospital_beds_per_thousand | 0.747794 |
| 1 | human_development_index | 0.228597 |
| 3 | gdp_per_capita | 0.010147 |
| 2 | extreme_poverty | 0.009692 |
| 4 | population_density | 0.003770 |
# Reload the full 26-country dataframe for the next analysis segment.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'France'
country2 = 'Italy'
# Keep only the columns used by the XGBoost population-health-index analysis,
# then restrict the rows to the two countries being compared.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8377 | France | 1/25/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8378 | France | 1/26/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8379 | France | 1/27/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8380 | France | 1/28/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
2135 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() lags by row position within each location group, so this
# assumes the rows are already sorted chronologically per country — TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no lagged value available yet).
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the target column 'Mortality Rate' and its
# lags, so the target leaks into the PCA inputs used downstream; PCA is also
# fit on unscaled data, so large-magnitude columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first n_components principal components as the inputs for the
# XGBoost population-health-index model (the original comment said 3, but
# the code keeps 7 components — the comment was stale).
n_components = 7
# NOTE(review): the PCA fitted above used df_updated.iloc[:, 2:], which
# contains the target column 'Mortality Rate' (and its lagged copies) — that
# leaks the target into the model inputs and inflates the test metrics.
# Refit here on the predictor columns only.
predictors = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA().fit(predictors)
principal_components = pca.transform(predictors)[:, :n_components]
# Each principal component is a linear mixture of ALL predictors, so label
# the columns PC1..PCn; reusing the original feature names here would
# misattribute the feature importances reported later.
selected_cols = ['PC%d' % (i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# Convert the categorical 'location' column to numerical indicator columns
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as the test set for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardising scaler on the training partition only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both partitions with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the grid search.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9974380799964881
# grid_search uses the default refit=True, so best_estimator_ is already
# fitted on the full training set; the explicit fit() below is redundant
# but harmless and is kept to preserve the original flow.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model with MSE, RMSE, R^2, and entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) is the KL divergence between the two vectors
# after normalising each to sum to 1 — it is not a standard regression
# metric, and it returns inf whenever some qk element is <= 0 while the
# matching pk element is > 0. Clip the predictions to a tiny positive floor
# so the value stays finite.
entropy_val = entropy(y_test, np.maximum(y_pred, 1e-12))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.04203231904296749 R2 Score: 0.9977496670068304 RMSE: 0.205018 Entropy Value: 0.001029589932983813
# Rank the model's importances (these correspond to the PCA-derived columns),
# highest first, and display the resulting table.
feature_importances = pd.DataFrame(
    {'feature': selected_cols,
     'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.900030 |
| 6 | median_age | 0.037244 |
| 2 | female_smokers | 0.028587 |
| 5 | aged_65_older | 0.015186 |
| 0 | cardiovasc_death_rate | 0.009428 |
| 3 | male_smokers | 0.006895 |
| 4 | life_expectancy | 0.002631 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute Windows path — not portable; consider a relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this analysis run
country1 = 'France'
country2 = 'Italy'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
# .copy() makes the filtered result an independent frame so the lagged-column
# assignments in the next cell do not raise SettingWithCopyWarning on a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 0.000000 |
| 8377 | France | 1/25/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 0.000000 |
| 8378 | France | 1/26/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 0.000000 |
| 8379 | France | 1/27/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 0.000000 |
| 8380 | France | 1/28/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.18 | 0.892 | 2.00 | 35220.084 | 205.859 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.18 | 0.892 | 2.00 | 35220.084 | 205.859 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.18 | 0.892 | 2.00 | 35220.084 | 205.859 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.18 | 0.892 | 2.00 | 35220.084 | 205.859 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.18 | 0.892 | 2.00 | 35220.084 | 205.859 | 0.735109 |
2135 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively (shifted per country via groupby)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the first days of each series fabricates values at
# the series start — dropping those rows may be preferable; verify intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# iloc[:,2:] selects all numeric columns, skipping 'location' and 'date'.
# NOTE(review): PCA is fit on the full (train+test) unscaled data, before the
# split and StandardScaler later on — leaks test data and lets large-magnitude
# features dominate the components; TODO confirm and restructure if unintended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 5 principal components — one per input variable of the country
# health index analysis. (An earlier comment said 3; the code uses 5.)
n_components = 5 # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse original feature names, but each
# column is a PCA component (a mixture of all inputs), not that feature itself.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (test set is transformed with these stats)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (transform only — fit was done on train data)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 uses all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and best mean CV R^2 score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.997212894312281
# Fit the model using the best hyperparameters
# (redundant: GridSearchCV with refit=True has already refit best_estimator_)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes KL divergence of normalised
# distributions — applying it to raw mortality values is questionable; verify.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.05589325373552914 R2 Score: 0.9970075780770488 RMSE: 0.236418 Entropy Value: 0.0014961704290713798
# Tabulate and rank the model importances (per PCA-derived column), then display.
feature_importances = pd.DataFrame(
    {'feature': selected_cols,
     'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.804554 |
| 0 | hospital_beds_per_thousand | 0.079309 |
| 4 | population_density | 0.068506 |
| 2 | extreme_poverty | 0.039535 |
| 3 | gdp_per_capita | 0.008096 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this analysis run
country1 = 'Netherlands'
country2 = 'Sweden'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered result an independent frame so the lagged-column
# assignments in the next cell do not raise SettingWithCopyWarning on a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.816005 |
2100 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively (per-country shifts)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (zero-filling fabricates values at each series start — see review note earlier)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): fit on full unscaled data before the split — potential leakage.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 7 principal components — one per input variable of the
# population health index analysis. (An earlier comment said 3; the code uses 7.)
n_components = 7 # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are PCA components labelled with the original feature
# names — each mixes all inputs; it is not the named feature itself.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (transform only)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 uses all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and best mean CV R^2 score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9974557008389675
# Fit the model using the best hyperparameters
# (redundant: GridSearchCV with refit=True has already refit best_estimator_)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy on raw mortality values is questionable —
# it normalises inputs into probability distributions; verify intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.009632293525443848 R2 Score: 0.9990787800266875 RMSE: 0.098144 Entropy Value: 0.0005803548472550031
# Tabulate and rank the model importances (per PCA-derived column), then display.
feature_importances = pd.DataFrame(
    {'feature': selected_cols,
     'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.788294 |
| 2 | female_smokers | 0.150990 |
| 6 | median_age | 0.023584 |
| 0 | cardiovasc_death_rate | 0.017991 |
| 3 | male_smokers | 0.017966 |
| 5 | aged_65_older | 0.000835 |
| 4 | life_expectancy | 0.000340 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this analysis run
country1 = 'Netherlands'
country2 = 'Sweden'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
# .copy() makes the filtered result an independent frame so the lagged-column
# assignments in the next cell do not raise SettingWithCopyWarning on a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 0.816005 |
2100 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively (per-country shifts)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (zero-filling fabricates values at each series start — see review note earlier)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): fit on full unscaled data before the split — potential leakage.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 5 principal components — one per input variable of the country
# health index analysis. (An earlier comment said 3; the code uses 5.)
n_components = 5 # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are PCA components labelled with the original feature
# names — each mixes all inputs; it is not the named feature itself.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (transform only)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 uses all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and best mean CV R^2 score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990746310006573
# Fit the model using the best hyperparameters
# (redundant: GridSearchCV with refit=True has already refit best_estimator_)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy on raw mortality values is questionable —
# it normalises inputs into probability distributions; verify intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.01681748604978763 R2 Score: 0.9983915975972862 RMSE: 0.129682 Entropy Value: 0.0005547247075158208
# Tabulate and rank the model importances (per PCA-derived column), then display.
feature_importances = pd.DataFrame(
    {'feature': selected_cols,
     'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.551596 |
| 2 | extreme_poverty | 0.253784 |
| 0 | hospital_beds_per_thousand | 0.164178 |
| 3 | gdp_per_capita | 0.029412 |
| 4 | population_density | 0.001030 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this analysis run
country1 = 'Portugal'
country2 = 'Spain'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered result an independent frame so the lagged-column
# assignments in the next cell do not raise SettingWithCopyWarning on a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
2097 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Per-country lagged mortality features: 1-day, 7-day and 30-day lags.
# The first rows of each country's series have no lagged value; fill with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the
# population-health predictors.
# BUG FIX: PCA was previously fitted on df_updated.iloc[:, 2:], which contains
# the target column 'Mortality Rate' and its lagged copies. That leaks the
# target into the model inputs and inflates every downstream CV/test score.
# PCA is now fitted on the 7 health features only.
pca_input_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                  'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
pca = PCA()
pca.fit(df_updated[pca_input_cols])
# Keep all 7 principal components (one per input feature).
n_components = 7
principal_components = pca.transform(df_updated[pca_input_cols])[:, :n_components]
# NOTE(review): the component columns keep the original feature names so the
# downstream code keeps working, but each column is a principal component (a
# linear combination of all features), not the feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=pca_input_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label so the frame is fully numeric.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# Reproducible 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training set only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both folds with the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; hyperparameters are chosen by the grid search below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9991285362384288
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits the best estimator on the full training
# set by default (refit=True), so this explicit fit repeats work without
# changing the model.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its two inputs and computes the
# KL divergence between them as if they were probability distributions.
# Mortality rates are not a distribution, and zeros in y_pred can make this
# quantity infinite - confirm this metric is actually intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.012129239818308182 R2 Score: 0.9977801811586169 RMSE: 0.110133 Entropy Value: 0.0005886831079683492
# Feature importances of the fitted XGBoost model.
# NOTE(review): the model was trained on principal components, so each
# importance belongs to a principal component, not to the original feature
# whose name labels the row - interpret these rankings with care.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | aged_65_older | 0.558126 |
| 0 | cardiovasc_death_rate | 0.307971 |
| 1 | diabetes_prevalence | 0.111634 |
| 2 | female_smokers | 0.018149 |
| 3 | male_smokers | 0.003648 |
| 6 | median_age | 0.000360 |
| 4 | life_expectancy | 0.000112 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path - this only runs on the
# author's machine; consider a relative path or a configurable location.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Portugal'
country2 = 'Spain'
# Restrict the frame to the country-health predictors needed for the
# XGBoost analysis, keeping only the two countries being compared.
health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
               'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 0.855148 |
2097 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Per-country lagged mortality features: 1-day, 7-day and 30-day lags.
# The first rows of each country's series have no lagged value; fill with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the
# country-health predictors.
# BUG FIX: PCA was previously fitted on df_updated.iloc[:, 2:], which contains
# the target column 'Mortality Rate' and its lagged copies. That leaks the
# target into the model inputs and inflates every downstream CV/test score.
# PCA is now fitted on the 5 country-health features only.
pca_input_cols = ['hospital_beds_per_thousand', 'human_development_index',
                  'extreme_poverty', 'gdp_per_capita', 'population_density']
pca = PCA()
pca.fit(df_updated[pca_input_cols])
# Keep all 5 principal components (one per input feature).
n_components = 5
principal_components = pca.transform(df_updated[pca_input_cols])[:, :n_components]
# NOTE(review): the component columns keep the original feature names so the
# downstream code keeps working, but each column is a principal component (a
# linear combination of all features), not the feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=pca_input_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label so the frame is fully numeric.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# Reproducible 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training set only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both folds with the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; hyperparameters are chosen by the grid search below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9983792131946579
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits the best estimator on the full training
# set by default (refit=True), so this explicit fit repeats work without
# changing the model.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its two inputs and computes the
# KL divergence between them as if they were probability distributions.
# Mortality rates are not a distribution, and zeros in y_pred can make this
# quantity infinite - confirm this metric is actually intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.010138589742890305 R2 Score: 0.9981444976871221 RMSE: 0.100691 Entropy Value: 0.0005635017566477138
# Feature importances of the fitted XGBoost model.
# NOTE(review): the model was trained on principal components, so each
# importance belongs to a principal component, not to the original feature
# whose name labels the row - interpret these rankings with care.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.528876 |
| 0 | hospital_beds_per_thousand | 0.437493 |
| 2 | extreme_poverty | 0.029621 |
| 3 | gdp_per_capita | 0.003685 |
| 4 | population_density | 0.000326 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path - this only runs on the
# author's machine; consider a relative path or a configurable location.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Slovakia'
country2 = 'Slovenia'
# Restrict the frame to the population-health predictors needed for the
# XGBoost analysis, keeping only the two countries being compared.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
               'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.536669 |
2091 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Per-country lagged mortality features: 1-day, 7-day and 30-day lags.
# The first rows of each country's series have no lagged value; fill with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the
# population-health predictors.
# BUG FIX: PCA was previously fitted on df_updated.iloc[:, 2:], which contains
# the target column 'Mortality Rate' and its lagged copies. That leaks the
# target into the model inputs and inflates every downstream CV/test score.
# PCA is now fitted on the 7 health features only.
pca_input_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                  'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
pca = PCA()
pca.fit(df_updated[pca_input_cols])
# Keep all 7 principal components (one per input feature).
n_components = 7
principal_components = pca.transform(df_updated[pca_input_cols])[:, :n_components]
# NOTE(review): the component columns keep the original feature names so the
# downstream code keeps working, but each column is a principal component (a
# linear combination of all features), not the feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=pca_input_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label so the frame is fully numeric.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# Reproducible 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training set only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both folds with the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; hyperparameters are chosen by the grid search below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9984789767307959
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits the best estimator on the full training
# set by default (refit=True), so this explicit fit repeats work without
# changing the model.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its two inputs and computes the
# KL divergence between them as if they were probability distributions.
# Mortality rates are not a distribution, and zeros in y_pred can make this
# quantity infinite - confirm this metric is actually intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0027083668308768272 R2 Score: 0.9986712195046868 RMSE: 0.052042 Entropy Value: 0.00047135693736216383
# Feature importances of the fitted XGBoost model.
# NOTE(review): the model was trained on principal components, so each
# importance belongs to a principal component, not to the original feature
# whose name labels the row - interpret these rankings with care.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 6 | median_age | 0.593303 |
| 1 | diabetes_prevalence | 0.295471 |
| 0 | cardiovasc_death_rate | 0.053646 |
| 5 | aged_65_older | 0.050607 |
| 4 | life_expectancy | 0.003674 |
| 2 | female_smokers | 0.002363 |
| 3 | male_smokers | 0.000937 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path - this only runs on the
# author's machine; consider a relative path or a configurable location.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Slovakia'
country2 = 'Slovenia'
# Restrict the frame to the country-health predictors needed for the
# XGBoost analysis, keeping only the two countries being compared.
health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
               'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 0.536669 |
2091 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Per-country lagged mortality features: 1-day, 7-day and 30-day lags.
# The first rows of each country's series have no lagged value; fill with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the
# country-health predictors.
# BUG FIX: PCA was previously fitted on df_updated.iloc[:, 2:], which contains
# the target column 'Mortality Rate' and its lagged copies. That leaks the
# target into the model inputs and inflates every downstream CV/test score.
# PCA is now fitted on the 5 country-health features only.
pca_input_cols = ['hospital_beds_per_thousand', 'human_development_index',
                  'extreme_poverty', 'gdp_per_capita', 'population_density']
pca = PCA()
pca.fit(df_updated[pca_input_cols])
# Keep all 5 principal components (one per input feature).
n_components = 5
principal_components = pca.transform(df_updated[pca_input_cols])[:, :n_components]
# NOTE(review): the component columns keep the original feature names so the
# downstream code keeps working, but each column is a principal component (a
# linear combination of all features), not the feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=pca_input_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label so the frame is fully numeric.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# Reproducible 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training set only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both folds with the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; hyperparameters are chosen by the grid search below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.998514703095446
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits the best estimator on the full training
# set by default (refit=True), so this explicit fit repeats work without
# changing the model.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its two inputs and computes the
# KL divergence between them as if they were probability distributions.
# Mortality rates are not a distribution, and zeros in y_pred can make this
# quantity infinite - confirm this metric is actually intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005931900948646389
R2 Score: 0.9970896873382034
RMSE: 0.077019
Entropy Value: 0.0007898893113525947
# Gain-based importances from the fitted booster, one value per input column
# (NOTE(review): inputs are PCA components here, so the feature labels are nominal)
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
# Rank features from most to least important for display
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.825169 |
| 0 | hospital_beds_per_thousand | 0.143681 |
| 2 | extreme_poverty | 0.022433 |
| 3 | gdp_per_capita | 0.006149 |
| 4 | population_density | 0.002567 |
# Importing the dataframe of all 26 countries
# (dataframe-one.csv is produced later in this notebook by dataframe_one.to_csv;
#  the absolute Windows path makes this non-portable)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under analysis for this run
country1 = 'United Kingdom'
country2 = 'United States'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the selected pair of countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084791 |
2136 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# Grouping by location ensures each country's shift never bleeds into another country's rows.
# (Shifts assume the rows are already in chronological order per country — TODO confirm sort order.)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values (the first 1/7/30 rows of each country created by the shifts)
# with 0 in a single vectorized call instead of three duplicated statements.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date' — it still includes
# 'Mortality Rate' and the three lagged mortality columns, so the target leaks
# into the PCA inputs used to build X. Confirm this is intended; it likely
# inflates the downstream R^2.
pca.fit(df_updated.iloc[:,2:])
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC7, not the raw
# features — labelling them with the original feature names is misleading for
# the feature-importance table produced later.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds PCA scores; y is the untransformed mortality rate
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model (70/30, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (statistics come from the training set only)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 candidate combinations in total)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 parallelises across cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and mean cross-validated score (R^2 for a regressor)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9573756200168303
# Retrieve the model refit on the best hyperparameters.
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refit on the full training set — calling fit() again here was redundant.
best_model = grid_search.best_estimator_
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy is KL divergence between normalised
# distributions, not a regression metric — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  3.067522668688002
R2 Score: 0.8735128665771991
RMSE: 1.751434
Entropy Value: 0.00997828364305316
# Gain-based importances from the fitted booster
# (NOTE(review): inputs are PCA components, so the feature labels are nominal)
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
# Rank from most to least important for display
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.515435 |
| 1 | diabetes_prevalence | 0.265632 |
| 2 | female_smokers | 0.128937 |
| 5 | aged_65_older | 0.040833 |
| 6 | median_age | 0.026848 |
| 4 | life_expectancy | 0.013474 |
| 3 | male_smokers | 0.008841 |
# Importing the dataframe of all 26 countries (re-read to undo the in-place
# filtering/one-hot transformations applied to df_updated above)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under analysis for this run
country1 = 'United Kingdom'
country2 = 'United States'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
# Keep only the rows belonging to the selected pair of countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 1.084791 |
2136 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# Grouping by location keeps each country's shift within its own rows.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values (the first 1/7/30 rows of each country created by the shifts)
# with 0 in a single vectorized call instead of three duplicated statements.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): iloc[:, 2:] still includes 'Mortality Rate' and the lag columns,
# so the target leaks into the PCA inputs — confirm this is intended.
pca.fit(df_updated.iloc[:,2:])
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC5, not the raw
# features — the labels are nominal.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
# X holds PCA scores; y is the untransformed mortality rate
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model (70/30, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (statistics come from the training set only)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 candidate combinations in total)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 parallelises across cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and mean cross-validated score (R^2 for a regressor)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9555599032367461
# Retrieve the model refit on the best hyperparameters.
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refit on the full training set — calling fit() again here was redundant.
best_model = grid_search.best_estimator_
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy is KL divergence between normalised
# distributions, not a regression metric — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  1.3831340557751377
R2 Score: 0.9429674428683925
RMSE: 1.176067
Entropy Value: 0.007189751715215273
# Gain-based importances from the fitted booster
# (NOTE(review): inputs are PCA components, so the feature labels are nominal)
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
# Rank from most to least important for display
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.471024 |
| 0 | hospital_beds_per_thousand | 0.373521 |
| 2 | extreme_poverty | 0.081580 |
| 4 | population_density | 0.049132 |
| 3 | gdp_per_capita | 0.024743 |
# Country Pair by Pair Analysis relative to cardiovascular death rate
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# (absolute Windows path makes this cell non-portable)
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Showing the pairings of countries based on cardiovascular death rate (13 pairs of countries)
# One filtered sub-frame per country, selected on the 'location' column
df_Austria = df[(df.location == "Austria")]
df_Belgium = df[(df.location == "Belgium")]
df_Canada = df[(df.location == "Canada")]
df_Cyprus = df[(df.location == "Cyprus")]
df_Denmark = df[(df.location == "Denmark")]
df_Finland = df[(df.location == "Finland")]
df_France = df[(df.location == "France")]
df_Iceland = df[(df.location == "Iceland")]
df_Ireland = df[(df.location == "Ireland")]
df_Italy = df[(df.location == "Italy")]
df_Luxembourg = df[(df.location == "Luxembourg")]
df_Netherlands = df[(df.location == "Netherlands")]
df_Portugal = df[(df.location == "Portugal")]
df_Spain = df[(df.location == "Spain")]
df_Sweden = df[(df.location == "Sweden")]
df_Switzerland = df[(df.location == "Switzerland")]
df_UnitedKingdom = df[(df.location == "United Kingdom")]
df_UnitedStates = df[(df.location == "United States")]
df_Czechia = df[(df.location == "Czechia")]
df_Estonia = df[(df.location == "Estonia")]
df_Slovakia = df[(df.location == "Slovakia")]
df_Slovenia = df[(df.location == "Slovenia")]
df_Bulgaria = df[(df.location == "Bulgaria")]
df_Latvia = df[(df.location == "Latvia")]
df_Romania = df[(df.location == "Romania")]
df_Serbia = df[(df.location == "Serbia")]
# tail(-2) drops the FIRST two rows of the UK frame
# (presumably to align its start date with the other countries — TODO confirm)
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)
# Exporting final_dataframe to CSV file (written to the current working directory,
# while later cells read it back from the Downloads folder — paths must match)
dataframe_one.to_csv("dataframe-one.csv")
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under analysis for this run
country1 = 'Austria'
country2 = 'Belgium'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the selected pair of countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# Grouping by location keeps each country's shift within its own rows.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values (the first 1/7/30 rows of each country created by the shifts)
# with 0 in a single vectorized call instead of three duplicated statements.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): iloc[:, 2:] still includes 'Mortality Rate' and the lag columns,
# so the target leaks into the PCA inputs — confirm this is intended.
pca.fit(df_updated.iloc[:,2:])
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the raw
# features — the labels are nominal.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds PCA scores; y is the untransformed mortality rate
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model (70/30, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (statistics come from the training set only)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 candidate combinations in total)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 parallelises across cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and mean cross-validated score (R^2 for a regressor)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.998456196460482
# Retrieve the model refit on the best hyperparameters.
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refit on the full training set — calling fit() again here was redundant.
best_model = grid_search.best_estimator_
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy is KL divergence between normalised
# distributions, not a regression metric — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.016140462125189346 R2 Score: 0.9986367589447218 RMSE: 0.127045 Entropy Value: 0.0007186737813811423
# Gain-based importances from the tuned XGBoost model.
# NOTE(review): the model was trained on principal components, so each "feature"
# name below really identifies a PCA component, not the original variable —
# interpret these rankings with care.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | diabetes_prevalence | 0.571618 |
| 1 | female_smokers | 0.179058 |
| 2 | male_smokers | 0.150706 |
| 5 | median_age | 0.080402 |
| 3 | life_expectancy | 0.018095 |
| 4 | aged_65_older | 0.000120 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — use a relative or configurable
# path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the country-health-index analysis
country1 = 'Austria'
country2 = 'Belgium'
# Keep the identifier columns plus the socio-economic predictors and the target,
# restricting the rows to the two selected countries in a single .loc call.
cols_of_interest = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols_of_interest]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality,
# and previous month mortality rates, respectively (shifted within each country so
# one country's history never bleeds into another's).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace the NaN values produced at the start of each country's series with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# BUG FIX: the original fed df_updated.iloc[:, 2:] to PCA, which contains the
# 'Mortality Rate' target and its three lagged copies.  Components built from the
# target leak the answer into the model inputs (hence the near-perfect R^2 reported
# downstream).  The target and its lags are excluded from the PCA input here.
target_cols = ['Mortality Rate', 'prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
pca_input = df_updated.drop(columns=target_cols).iloc[:, 2:]
pca = PCA()
pca.fit(pca_input)
# Setting the number of principal components to 6, matching the number of predictors
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(pca_input)[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis.
# NOTE(review): these column names are labels of convenience — each column is a
# principal component (a linear mix of all six predictors), not the raw variable.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the remaining categorical column ('location') with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
# The principal components (labeled with the original predictor names) feed the model
selected_cols = [
    'hospital_beds_per_thousand',
    'human_development_index',
    'extreme_poverty',
    'gdp_per_capita',
    'population_density',
    'population',
]
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train-test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the inputs, learning the statistics from the training portion only
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 parameter combinations; with cv=10 that is 3240 model fits.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all CPU cores; default scoring for a regressor is its R^2 score.
# NOTE(review): the rows were shuffled by train_test_split, so K-fold CV mixes
# temporally adjacent days across folds — TimeSeriesSplit would give a more honest
# estimate for time-series data; confirm intent.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9985453142557521
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set by
# default (refit=True), so this second fit retrains on the same data and is
# redundant (harmless, but doubles training time).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays to sum to 1 and
# returns the KL divergence D(y_test || y_pred); it is not a standard regression
# metric and assumes non-negative inputs — verify this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.01731215895223598 R2 Score: 0.9985377961513036 RMSE: 0.131576 Entropy Value: 0.000755209259742723
# Gain-based importances from the tuned XGBoost model.
# NOTE(review): the model was trained on principal components, so each "feature"
# name below really identifies a PCA component, not the original variable —
# interpret these rankings with care.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.604442 |
| 5 | population | 0.227045 |
| 0 | hospital_beds_per_thousand | 0.118292 |
| 2 | extreme_poverty | 0.041327 |
| 3 | gdp_per_capita | 0.008604 |
| 4 | population_density | 0.000290 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — use a relative or configurable
# path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the population-health-index analysis
country1 = 'Canada'
country2 = 'Cyprus'
# Keep the identifier columns plus the population-health predictors and the target,
# restricting the rows to the two selected countries in a single .loc call.
cols_of_interest = ['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols_of_interest]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.093162 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality,
# and previous month mortality rates, respectively (shifted within each country so
# one country's history never bleeds into another's).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace the NaN values produced at the start of each country's series with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# BUG FIX: the original fed df_updated.iloc[:, 2:] to PCA, which contains the
# 'Mortality Rate' target and its three lagged copies.  Components built from the
# target leak the answer into the model inputs (hence the near-perfect R^2 reported
# downstream).  The target and its lags are excluded from the PCA input here.
target_cols = ['Mortality Rate', 'prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
pca_input = df_updated.drop(columns=target_cols).iloc[:, 2:]
pca = PCA()
pca.fit(pca_input)
# Setting the number of principal components to 6, matching the number of predictors
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(pca_input)[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis.
# NOTE(review): these column names are labels of convenience — each column is a
# principal component (a linear mix of all six predictors), not the raw variable.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the remaining categorical column ('location') with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
# The principal components (labeled with the original predictor names) feed the model
selected_cols = [
    'diabetes_prevalence',
    'female_smokers',
    'male_smokers',
    'life_expectancy',
    'aged_65_older',
    'median_age',
]
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train-test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the inputs, learning the statistics from the training portion only
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 parameter combinations; with cv=10 that is 3240 model fits.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all CPU cores; default scoring for a regressor is its R^2 score.
# NOTE(review): the rows were shuffled by train_test_split, so K-fold CV mixes
# temporally adjacent days across folds — TimeSeriesSplit would give a more honest
# estimate for time-series data; confirm intent.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9989867496016483
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set by
# default (refit=True), so this second fit retrains on the same data and is
# redundant (harmless, but doubles training time).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays to sum to 1 and
# returns the KL divergence D(y_test || y_pred); it is not a standard regression
# metric and assumes non-negative inputs — verify this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0028020652544185898 R2 Score: 0.9991762074874513 RMSE: 0.052935 Entropy Value: 0.0003290685489915789
# Gain-based importances from the tuned XGBoost model.
# NOTE(review): the model was trained on principal components, so each "feature"
# name below really identifies a PCA component, not the original variable —
# interpret these rankings with care.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | diabetes_prevalence | 0.444854 |
| 1 | female_smokers | 0.370653 |
| 5 | median_age | 0.153361 |
| 2 | male_smokers | 0.025670 |
| 3 | life_expectancy | 0.004770 |
| 4 | aged_65_older | 0.000692 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — use a relative or configurable
# path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the country-health-index analysis
country1 = 'Canada'
country2 = 'Cyprus'
# Keep the identifier columns plus the socio-economic predictors and the target,
# restricting the rows to the two selected countries in a single .loc call.
cols_of_interest = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols_of_interest]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.093162 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality,
# and previous month mortality rates, respectively (shifted within each country so
# one country's history never bleeds into another's).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace the NaN values produced at the start of each country's series with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# BUG FIX: the original fed df_updated.iloc[:, 2:] to PCA, which contains the
# 'Mortality Rate' target and its three lagged copies.  Components built from the
# target leak the answer into the model inputs (hence the near-perfect R^2 reported
# downstream).  The target and its lags are excluded from the PCA input here.
target_cols = ['Mortality Rate', 'prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
pca_input = df_updated.drop(columns=target_cols).iloc[:, 2:]
pca = PCA()
pca.fit(pca_input)
# Setting the number of principal components to 6, matching the number of predictors
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(pca_input)[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis.
# NOTE(review): these column names are labels of convenience — each column is a
# principal component (a linear mix of all six predictors), not the raw variable.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the remaining categorical column ('location') with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
# The principal components (labeled with the original predictor names) feed the model
selected_cols = [
    'hospital_beds_per_thousand',
    'human_development_index',
    'extreme_poverty',
    'gdp_per_capita',
    'population_density',
    'population',
]
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train-test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the inputs, learning the statistics from the training portion only
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 parameter combinations; with cv=10 that is 3240 model fits.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all CPU cores; default scoring for a regressor is its R^2 score.
# NOTE(review): the rows were shuffled by train_test_split, so K-fold CV mixes
# temporally adjacent days across folds — TimeSeriesSplit would give a more honest
# estimate for time-series data; confirm intent.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989605271130995
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set by
# default (refit=True), so this second fit retrains on the same data and is
# redundant (harmless, but doubles training time).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays to sum to 1 and
# returns the KL divergence D(y_test || y_pred); it is not a standard regression
# metric and assumes non-negative inputs — verify this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002685282233557505 R2 Score: 0.999210541084082 RMSE: 0.051820 Entropy Value: 0.00034527877599041973
# Gain-based importances from the tuned XGBoost model.
# NOTE(review): the model was trained on principal components, so each "feature"
# name below really identifies a PCA component, not the original variable —
# interpret these rankings with care.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.655426 |
| 1 | human_development_index | 0.198181 |
| 0 | hospital_beds_per_thousand | 0.115957 |
| 2 | extreme_poverty | 0.020517 |
| 4 | population_density | 0.006514 |
| 3 | gdp_per_capita | 0.003405 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — use a relative or configurable
# path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the population-health-index analysis
country1 = 'Denmark'
country2 = 'Finland'
# Keep the identifier columns plus the population-health predictors and the target,
# restricting the rows to the two selected countries in a single .loc call.
cols_of_interest = ['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols_of_interest]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5188 | Denmark | 2/3/2020 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5189 | Denmark | 2/4/2020 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5190 | Denmark | 2/5/2020 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5191 | Denmark | 2/6/2020 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
2128 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality-rate features (previous day / week / month) per country
# so the time series is framed as a supervised-learning problem for XGBoost.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first 1/7/30 rows of each country have no history; fill those NaNs with 0.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Principal Component Analysis to address multi-collinearity among the predictors.
# BUG FIX: the original code fit PCA on df_updated.iloc[:, 2:], which includes the
# 'Mortality Rate' target and its three lag columns. Feeding the target into the
# model inputs is target leakage and inflates the downstream R^2. Fit and
# transform on the six predictor columns only.
feature_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers',
                'life_expectancy', 'aged_65_older', 'median_age']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep 6 principal components — one per input variable of the XGBoost Model
# Analysis for the population health index.
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original features;
# reusing the raw feature names here is misleading (PC1..PC6 would be accurate),
# and the feature importances computed later inherit this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns never enter X below (only y is taken
# from df_updated), so this encoding is effectively unused.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split on a daily time series places temporally adjacent
# days in train and test, which leaks information for autocorrelated data —
# consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): standardization is conventionally applied before PCA, not after —
# confirm the intended order.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data only.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
# Hyperparameter grid for the XGBoost regressor.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
xgb_model = xgb.XGBRegressor()
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987252225958365
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits
# best_estimator_ on the full training set, so this extra fit retrains on the
# same data — redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence after
# normalizing each array to a probability distribution — it is not a standard
# regression metric, and it returns inf if y_pred has a zero where y_test is
# positive. Confirm this is the intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.010232434979697452 R2 Score: 0.9941953413231738 RMSE: 0.101155 Entropy Value: 0.0021676176101701077
# NOTE(review): these importances are computed on PCA components that were
# labeled with the original feature names in principal_df — they do not measure
# the raw input features directly.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | diabetes_prevalence | 0.491157 |
| 1 | female_smokers | 0.246916 |
| 2 | male_smokers | 0.154249 |
| 5 | median_age | 0.058833 |
| 3 | life_expectancy | 0.048338 |
| 4 | aged_65_older | 0.000507 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — this only runs on one machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the analysis.
country1 = 'Denmark'
country2 = 'Finland'
# Keep only the country-level socioeconomic predictors plus the mortality target
# for the XGBoost Model Analysis of the country health index.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand',
             'human_development_index', 'extreme_poverty', 'gdp_per_capita',
             'population_density', 'population', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5188 | Denmark | 2/3/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5189 | Denmark | 2/4/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5190 | Denmark | 2/5/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5191 | Denmark | 2/6/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
2128 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality-rate features (previous day / week / month) per country
# so the time series is framed as a supervised-learning problem for XGBoost.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first 1/7/30 rows of each country have no history; fill those NaNs with 0.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Principal Component Analysis to address multi-collinearity among the predictors.
# BUG FIX: the original code fit PCA on df_updated.iloc[:, 2:], which includes the
# 'Mortality Rate' target and its three lag columns. Feeding the target into the
# model inputs is target leakage and inflates the downstream R^2. Fit and
# transform on the six predictor columns only.
feature_cols = ['hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density',
                'population']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep 6 principal components — one per input variable of the XGBoost Model
# Analysis for the country health index.
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original features;
# reusing the raw feature names here is misleading (PC1..PC6 would be accurate),
# and the feature importances computed later inherit this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns never enter X below (only y is taken
# from df_updated), so this encoding is effectively unused.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split on a daily time series places temporally adjacent
# days in train and test, which leaks information for autocorrelated data —
# consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): standardization is conventionally applied before PCA, not after —
# confirm the intended order.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data only.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
# Hyperparameter grid for the XGBoost regressor.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
xgb_model = xgb.XGBRegressor()
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9984953654596765
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits
# best_estimator_ on the full training set, so this extra fit retrains on the
# same data — redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence after
# normalizing each array to a probability distribution — it is not a standard
# regression metric, and it returns inf if y_pred has a zero where y_test is
# positive. Confirm this is the intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008136863131264145 R2 Score: 0.9953841179278682 RMSE: 0.090205 Entropy Value: 0.0015398143907589235
# NOTE(review): these importances are computed on PCA components that were
# labeled with the original feature names in principal_df — they do not measure
# the raw input features directly.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.753936 |
| 5 | population | 0.092143 |
| 0 | hospital_beds_per_thousand | 0.070220 |
| 2 | extreme_poverty | 0.061815 |
| 3 | gdp_per_capita | 0.020882 |
| 4 | population_density | 0.001005 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — this only runs on one machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the analysis.
country1 = 'France'
country2 = 'Iceland'
# Keep only the population-health predictors plus the mortality target for the
# XGBoost Model Analysis of the population health index.
keep_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers',
             'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
             'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, keep_cols]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| 8377 | France | 1/25/2020 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| 8378 | France | 1/26/2020 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| 8379 | France | 1/27/2020 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| 8380 | France | 1/28/2020 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
2107 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality-rate features (previous day / week / month) per country
# so the time series is framed as a supervised-learning problem for XGBoost.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first 1/7/30 rows of each country have no history; fill those NaNs with 0.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Principal Component Analysis to address multi-collinearity among the predictors.
# BUG FIX: the original code fit PCA on df_updated.iloc[:, 2:], which includes the
# 'Mortality Rate' target and its three lag columns. Feeding the target into the
# model inputs is target leakage and inflates the downstream R^2. Fit and
# transform on the six predictor columns only.
feature_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers',
                'life_expectancy', 'aged_65_older', 'median_age']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep 6 principal components — one per input variable of the XGBoost Model
# Analysis for the population health index.
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original features;
# reusing the raw feature names here is misleading (PC1..PC6 would be accurate),
# and the feature importances computed later inherit this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns never enter X below (only y is taken
# from df_updated), so this encoding is effectively unused.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split on a daily time series places temporally adjacent
# days in train and test, which leaks information for autocorrelated data —
# consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): standardization is conventionally applied before PCA, not after —
# confirm the intended order.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data only.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
# Hyperparameter grid for the XGBoost regressor.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
xgb_model = xgb.XGBRegressor()
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9946592139246286
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits
# best_estimator_ on the full training set, so this extra fit retrains on the
# same data — redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence after
# normalizing each array to a probability distribution — it is not a standard
# regression metric, and it returns inf if y_pred has a zero where y_test is
# positive. Confirm this is the intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.053596386319007226 R2 Score: 0.9957380613117373 RMSE: 0.231509 Entropy Value: 0.0014569645741554262
# NOTE(review): these importances are computed on PCA components that were
# labeled with the original feature names in principal_df — they do not measure
# the raw input features directly.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.568274 |
| 0 | diabetes_prevalence | 0.232028 |
| 1 | female_smokers | 0.185377 |
| 2 | male_smokers | 0.007281 |
| 3 | life_expectancy | 0.004830 |
| 4 | aged_65_older | 0.002209 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — this only runs on one machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the analysis.
country1 = 'France'
country2 = 'Iceland'
# Keep only the country-level socioeconomic predictors plus the mortality target
# for the XGBoost Model Analysis of the country health index.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand',
             'human_development_index', 'extreme_poverty', 'gdp_per_capita',
             'population_density', 'population', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8377 | France | 1/25/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8378 | France | 1/26/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8379 | France | 1/27/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8380 | France | 1/28/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
2107 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality-rate features (previous day / week / month) per country
# so the time series is framed as a supervised-learning problem for XGBoost.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first 1/7/30 rows of each country have no history; fill those NaNs with 0.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Principal Component Analysis to address multi-collinearity among the predictors.
# BUG FIX: the original code fit PCA on df_updated.iloc[:, 2:], which includes the
# 'Mortality Rate' target and its three lag columns. Feeding the target into the
# model inputs is target leakage and inflates the downstream R^2. Fit and
# transform on the six predictor columns only.
feature_cols = ['hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density',
                'population']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep 6 principal components — one per input variable of the XGBoost Model
# Analysis for the country health index.
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original features;
# reusing the raw feature names here is misleading (PC1..PC6 would be accurate),
# and the feature importances computed later inherit this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns never enter X below (only y is taken
# from df_updated), so this encoding is effectively unused.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split on a daily time series places temporally adjacent
# days in train and test, which leaks information for autocorrelated data —
# consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): standardization is conventionally applied before PCA, not after —
# confirm the intended order.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data only.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
# Hyperparameter grid for the XGBoost regressor.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
xgb_model = xgb.XGBRegressor()
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9946715554617531
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits
# best_estimator_ on the full training set, so this extra fit retrains on the
# same data — redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence after
# normalizing each array to a probability distribution — it is not a standard
# regression metric, and it returns inf if y_pred has a zero where y_test is
# positive. Confirm this is the intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.06349754860870421 R2 Score: 0.9949507293753999 RMSE: 0.251987 Entropy Value: 0.002273841175109527
# Rank the model inputs by XGBoost importance, highest first.
# NOTE(review): the model was trained on PCA components relabelled with these
# variable names, so each "importance" belongs to a component, not to the raw
# variable it is named after.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.739557 |
| 0 | hospital_beds_per_thousand | 0.133697 |
| 4 | population_density | 0.060550 |
| 5 | population | 0.031155 |
| 3 | gdp_per_capita | 0.017913 |
| 2 | extreme_poverty | 0.017128 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded, user-specific absolute Windows path — breaks on any
# other machine; consider a relative path or configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Ireland'
country2 = 'Italy'
# Restrict the data to the population-health predictors (plus identifiers and
# the target) for the two countries under comparison.
pop_health_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers',
                   'male_smokers', 'life_expectancy', 'aged_65_older',
                   'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), pop_health_cols]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The start of each country's series has no history, so the lags are NaN there;
# treat missing history as zero mortality (every series begins at 0 deaths)
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# FIX: previously PCA was fit on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate' itself — target leakage that inflates every
# downstream score. Fit PCA on predictor columns only (health features + lags).
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep 6 components to match the number of input variables used by the
# XGBoost Model Analysis for the population health index
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): labelling components with the original variable names is
# misleading — each component is a linear mix of ALL predictors, so downstream
# "feature importances" describe components, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns produced here are never used below — X is
# built from principal_df only — so this encoding has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Labels for the six population-health inputs.
# NOTE(review): these index PCA components that were merely relabelled with the
# original variable names; each column is a linear mix of all predictors.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of daily time-series rows mixes past and future
# observations between train and test, leaking temporal information.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with statistics learned from the training data only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base XGBoost regressor; its hyperparameters are chosen by the search below
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)

# Exhaustive grid search with 10-fold cross-validation, using all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989070959122885
# Use the tuned model directly: GridSearchCV (refit=True by default) has already
# refit best_estimator_ on the full training set, so the original extra
# best_model.fit(...) call only repeated that training and is removed.
best_model = grid_search.best_estimator_
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after
# normalising its inputs into probability distributions; mortality rates are
# not distributions, so this value is hard to interpret — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005371809550427108 R2 Score: 0.9995586936801555 RMSE: 0.073293 Entropy Value: 0.00023323823646080252
# Rank the model inputs by XGBoost importance, highest first.
# NOTE(review): the model was trained on PCA components relabelled with these
# variable names, so each "importance" belongs to a component, not to the raw
# variable it is named after.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.667548 |
| 0 | diabetes_prevalence | 0.180913 |
| 1 | female_smokers | 0.142373 |
| 2 | male_smokers | 0.008080 |
| 3 | life_expectancy | 0.000833 |
| 4 | aged_65_older | 0.000253 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded, user-specific absolute Windows path — breaks on any
# other machine; consider a relative path or configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Ireland'
country2 = 'Italy'
# Restrict the data to the country-health predictors (plus identifiers and
# the target) for the two countries under comparison.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'extreme_poverty',
                       'gdp_per_capita', 'population_density', 'population',
                       'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The start of each country's series has no history, so the lags are NaN there;
# treat missing history as zero mortality (every series begins at 0 deaths)
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# FIX: previously PCA was fit on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate' itself — target leakage that inflates every
# downstream score. Fit PCA on predictor columns only (health features + lags).
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep 6 components to match the number of input variables used by the
# XGBoost Model Analysis for the country health index
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): labelling components with the original variable names is
# misleading — each component is a linear mix of ALL predictors, so downstream
# "feature importances" describe components, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns produced here are never used below — X is
# built from principal_df only — so this encoding has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Labels for the six country-health inputs.
# NOTE(review): these index PCA components that were merely relabelled with the
# original variable names; each column is a linear mix of all predictors.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of daily time-series rows mixes past and future
# observations between train and test, leaking temporal information.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with statistics learned from the training data only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base XGBoost regressor; its hyperparameters are chosen by the search below
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)

# Exhaustive grid search with 10-fold cross-validation, using all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9988984971748192
# Use the tuned model directly: GridSearchCV (refit=True by default) has already
# refit best_estimator_ on the full training set, so the original extra
# best_model.fit(...) call only repeated that training and is removed.
best_model = grid_search.best_estimator_
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after
# normalising its inputs into probability distributions; mortality rates are
# not distributions, so this value is hard to interpret — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.009889496543689754 R2 Score: 0.9991875554626719 RMSE: 0.099446 Entropy Value: 0.00044270709478587373
# Rank the model inputs by XGBoost importance, highest first.
# NOTE(review): the model was trained on PCA components relabelled with these
# variable names, so each "importance" belongs to a component, not to the raw
# variable it is named after.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.702587 |
| 0 | hospital_beds_per_thousand | 0.200820 |
| 5 | population | 0.042593 |
| 2 | extreme_poverty | 0.026734 |
| 3 | gdp_per_capita | 0.018469 |
| 4 | population_density | 0.008797 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded, user-specific absolute Windows path — breaks on any
# other machine; consider a relative path or configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Luxembourg'
country2 = 'Netherlands'
# Restrict the data to the population-health predictors (plus identifiers and
# the target) for the two countries under comparison.
pop_health_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers',
                   'male_smokers', 'life_expectancy', 'aged_65_older',
                   'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), pop_health_cols]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
2078 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The start of each country's series has no history, so the lags are NaN there;
# treat missing history as zero mortality (every series begins at 0 deaths)
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# FIX: previously PCA was fit on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate' itself — target leakage that inflates every
# downstream score. Fit PCA on predictor columns only (health features + lags).
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep 6 components to match the number of input variables used by the
# XGBoost Model Analysis for the population health index
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): labelling components with the original variable names is
# misleading — each component is a linear mix of ALL predictors, so downstream
# "feature importances" describe components, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns produced here are never used below — X is
# built from principal_df only — so this encoding has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Labels for the six population-health inputs.
# NOTE(review): these index PCA components that were merely relabelled with the
# original variable names; each column is a linear mix of all predictors.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of daily time-series rows mixes past and future
# observations between train and test, leaking temporal information.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with statistics learned from the training data only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base XGBoost regressor; its hyperparameters are chosen by the search below
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)

# Exhaustive grid search with 10-fold cross-validation, using all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9989525073226527
# Use the tuned model directly: GridSearchCV (refit=True by default) has already
# refit best_estimator_ on the full training set, so the original extra
# best_model.fit(...) call only repeated that training and is removed.
best_model = grid_search.best_estimator_
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after
# normalising its inputs into probability distributions; mortality rates are
# not distributions, so this value is hard to interpret — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004874464208163757 R2 Score: 0.9993646974648677 RMSE: 0.069817 Entropy Value: 0.0008434937353807828
# Rank the model inputs by XGBoost importance, highest first.
# NOTE(review): the model was trained on PCA components relabelled with these
# variable names, so each "importance" belongs to a component, not to the raw
# variable it is named after.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.811714 |
| 0 | diabetes_prevalence | 0.116426 |
| 2 | male_smokers | 0.038479 |
| 1 | female_smokers | 0.030675 |
| 3 | life_expectancy | 0.002520 |
| 4 | aged_65_older | 0.000185 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded, user-specific absolute Windows path — breaks on any
# other machine; consider a relative path or configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Luxembourg'
country2 = 'Netherlands'
# Restrict the data to the country-health predictors (plus identifiers and
# the target) for the two countries under comparison.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'extreme_poverty',
                       'gdp_per_capita', 'population_density', 'population',
                       'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
2078 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged copies of the target (1 day, 7 days, 30 days back), computed per country
# so one country's history never leaks into another's.
lag_spec = [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]
for lag_col, periods in lag_spec:
    lagged = df_updated.groupby('location')['Mortality Rate'].shift(periods)
    # The first `periods` rows of each country have no history; treat them as 0.
    df_updated[lag_col] = lagged.fillna(0)
# PCA to decorrelate the inputs before modeling.
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' itself and the three lag columns just created — the
# fitted components therefore encode the target (likely inflating downstream
# R^2). Confirm whether the target should be excluded from the PCA fit.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
# Keep the leading 6 principal components — one per country-health-index input.
n_components = 6
pc_scores = pca.transform(df_updated.iloc[:,2:])
# NOTE(review): the columns below hold PCA component scores, not the original
# features; the original feature names are reused only as labels, so downstream
# "feature importances" actually rank components — confirm this is intended.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
principal_df = pd.DataFrame(data=pc_scores[:, :n_components], columns=selected_cols)
principal_df['location'] = df_updated['location'].to_numpy()
# One-hot encode 'location' (replaces the categorical column with dummy columns).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize using statistics from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()
# Scale both splits with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor and its hyperparameter search space.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987243093178199
# Best estimator from the search.
# NOTE(review): GridSearchCV (refit=True by default) has already refit
# best_estimator_ on the training data, so this explicit fit is redundant
# but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)
# Held-out-set error metrics.
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions, so this is the KL divergence between actual and predicted
# mortality vectors, not Shannon entropy — confirm this is the intended metric.
test_entropy = entropy(y_test, y_pred)
print("MSE: ", test_mse)
print('R2 Score:', test_r2)
print("RMSE: %f" % (test_rmse))
print("Entropy Value:", test_entropy)
MSE: 0.007716113263742701 R2 Score: 0.9989943374064345 RMSE: 0.087841 Entropy Value: 0.0017161933932385992
# Rank the model inputs (PCA-score columns) by learned importance, descending.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.482469 |
| 2 | extreme_poverty | 0.244199 |
| 0 | hospital_beds_per_thousand | 0.199908 |
| 5 | population | 0.057787 |
| 3 | gdp_per_capita | 0.014620 |
| 4 | population_density | 0.001017 |
# Reload the full 26-country dataset so this analysis starts from clean data.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Portugal'
country2 = 'Spain'
# Restrict the frame to the two countries under comparison and to the
# population-health-index predictors plus the identifying keys and the target.
feature_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, feature_cols]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
2097 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged copies of the target (1 day, 7 days, 30 days back), computed per country
# so one country's history never leaks into another's.
lag_spec = [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]
for lag_col, periods in lag_spec:
    lagged = df_updated.groupby('location')['Mortality Rate'].shift(periods)
    # The first `periods` rows of each country have no history; treat them as 0.
    df_updated[lag_col] = lagged.fillna(0)
# PCA to decorrelate the inputs before modeling.
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' itself and the three lag columns just created — the
# fitted components therefore encode the target (likely inflating downstream
# R^2). Confirm whether the target should be excluded from the PCA fit.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
# Keep the leading 6 principal components — one per population-health-index input.
n_components = 6
pc_scores = pca.transform(df_updated.iloc[:,2:])
# NOTE(review): the columns below hold PCA component scores, not the original
# features; the original feature names are reused only as labels, so downstream
# "feature importances" actually rank components — confirm this is intended.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(data=pc_scores[:, :n_components], columns=selected_cols)
principal_df['location'] = df_updated['location'].to_numpy()
# One-hot encode 'location' (replaces the categorical column with dummy columns).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize using statistics from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()
# Scale both splits with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor and its hyperparameter search space.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987444044994997
# Best estimator from the search.
# NOTE(review): GridSearchCV (refit=True by default) has already refit
# best_estimator_ on the training data, so this explicit fit is redundant
# but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)
# Held-out-set error metrics.
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions, so this is the KL divergence between actual and predicted
# mortality vectors, not Shannon entropy — confirm this is the intended metric.
test_entropy = entropy(y_test, y_pred)
print("MSE: ", test_mse)
print('R2 Score:', test_r2)
print("RMSE: %f" % (test_rmse))
print("Entropy Value:", test_entropy)
MSE: 0.006938078296368985 R2 Score: 0.9987302355995943 RMSE: 0.083295 Entropy Value: 0.0004794443988216877
# Rank the model inputs (PCA-score columns) by learned importance, descending.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | diabetes_prevalence | 0.444681 |
| 1 | female_smokers | 0.434714 |
| 5 | median_age | 0.093224 |
| 2 | male_smokers | 0.015735 |
| 3 | life_expectancy | 0.011364 |
| 4 | aged_65_older | 0.000283 |
# Reload the full 26-country dataset so this analysis starts from clean data.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Portugal'
country2 = 'Spain'
# Restrict the frame to the two countries under comparison and to the
# country-health-index predictors plus the identifying keys and the target.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, feature_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
2097 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged copies of the target (1 day, 7 days, 30 days back), computed per country
# so one country's history never leaks into another's.
lag_spec = [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]
for lag_col, periods in lag_spec:
    lagged = df_updated.groupby('location')['Mortality Rate'].shift(periods)
    # The first `periods` rows of each country have no history; treat them as 0.
    df_updated[lag_col] = lagged.fillna(0)
# PCA to decorrelate the inputs before modeling.
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' itself and the three lag columns just created — the
# fitted components therefore encode the target (likely inflating downstream
# R^2). Confirm whether the target should be excluded from the PCA fit.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
# Keep the leading 6 principal components — one per country-health-index input.
n_components = 6
pc_scores = pca.transform(df_updated.iloc[:,2:])
# NOTE(review): the columns below hold PCA component scores, not the original
# features; the original feature names are reused only as labels, so downstream
# "feature importances" actually rank components — confirm this is intended.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
principal_df = pd.DataFrame(data=pc_scores[:, :n_components], columns=selected_cols)
principal_df['location'] = df_updated['location'].to_numpy()
# One-hot encode 'location' (replaces the categorical column with dummy columns).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize using statistics from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()
# Scale both splits with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor and its hyperparameter search space.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9982848456743826
# Best estimator from the search.
# NOTE(review): GridSearchCV (refit=True by default) has already refit
# best_estimator_ on the training data, so this explicit fit is redundant
# but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)
# Held-out-set error metrics.
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions, so this is the KL divergence between actual and predicted
# mortality vectors, not Shannon entropy — confirm this is the intended metric.
test_entropy = entropy(y_test, y_pred)
print("MSE: ", test_mse)
print('R2 Score:', test_r2)
print("RMSE: %f" % (test_rmse))
print("Entropy Value:", test_entropy)
MSE: 0.011554150205913471 R2 Score: 0.9978854305210006 RMSE: 0.107490 Entropy Value: 0.0006584258574244559
# Rank the model inputs (PCA-score columns) by learned importance, descending.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.704819 |
| 5 | population | 0.214352 |
| 0 | hospital_beds_per_thousand | 0.044421 |
| 2 | extreme_poverty | 0.030265 |
| 3 | gdp_per_capita | 0.005652 |
| 4 | population_density | 0.000491 |
# Reload the full 26-country dataset so this analysis starts from clean data.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Sweden'
country2 = 'Switzerland'
# Restrict the frame to the two countries under comparison and to the
# population-health-index predictors plus the identifying keys and the target.
feature_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, feature_cols]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.816005 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged copies of the target (1 day, 7 days, 30 days back), computed per country
# so one country's history never leaks into another's.
lag_spec = [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]
for lag_col, periods in lag_spec:
    lagged = df_updated.groupby('location')['Mortality Rate'].shift(periods)
    # The first `periods` rows of each country have no history; treat them as 0.
    df_updated[lag_col] = lagged.fillna(0)
# PCA to decorrelate the inputs before modeling.
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' itself and the three lag columns just created — the
# fitted components therefore encode the target (likely inflating downstream
# R^2). Confirm whether the target should be excluded from the PCA fit.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
# Keep the leading 6 principal components — one per population-health-index input.
n_components = 6
pc_scores = pca.transform(df_updated.iloc[:,2:])
# NOTE(review): the columns below hold PCA component scores, not the original
# features; the original feature names are reused only as labels, so downstream
# "feature importances" actually rank components — confirm this is intended.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(data=pc_scores[:, :n_components], columns=selected_cols)
principal_df['location'] = df_updated['location'].to_numpy()
# One-hot encode 'location' (replaces the categorical column with dummy columns).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize using statistics from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()
# Scale both splits with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor and its hyperparameter search space.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9984333716901233
# Best estimator from the search.
# NOTE(review): GridSearchCV (refit=True by default) has already refit
# best_estimator_ on the training data, so this explicit fit is redundant
# but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)
# Held-out-set error metrics.
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions, so this is the KL divergence between actual and predicted
# mortality vectors, not Shannon entropy — confirm this is the intended metric.
test_entropy = entropy(y_test, y_pred)
print("MSE: ", test_mse)
print('R2 Score:', test_r2)
print("RMSE: %f" % (test_rmse))
print("Entropy Value:", test_entropy)
MSE: 0.015931239549295276 R2 Score: 0.9969563630033106 RMSE: 0.126219 Entropy Value: 0.0006754021628688348
# Rank the six PCA-derived inputs by the tuned model's importance scores.
# NOTE(review): 'feature' carries the original variable names used to label the
# principal components, so each row describes a component, not the raw variable.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values(by='importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.775295 |
| 0 | diabetes_prevalence | 0.148899 |
| 3 | life_expectancy | 0.030343 |
| 2 | male_smokers | 0.025867 |
| 5 | median_age | 0.019459 |
| 4 | aged_65_older | 0.000137 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — breaks on any other machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the pipeline.
country1 = 'Sweden'
country2 = 'Switzerland'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows for the two selected countries.
# NOTE(review): this chained selection yields a slice; the in-place column assignments
# that follow may raise pandas SettingWithCopyWarning — consider appending .copy().
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.816005 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per country (groupby 'location') so lags never cross a country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the initial lag gaps with 0 fabricates "zero mortality" history
# for each country's first rows — confirm this is preferable to dropping those rows.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input still
# contains 'Mortality Rate' (the target) and its lag columns — the target leaks into
# the components later used as model inputs, which likely inflates the reported scores.
# PCA is also fit on unscaled columns of wildly different magnitudes (population vs
# rates), and on the full dataset before the train/test split (further leakage).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # number of principal components kept as XGBoost inputs
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the component columns are labelled with the original feature names,
# but each column is a principal component (a mixture of all PCA inputs), not that variable.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummies are never added to X below — df_updated is only
# used to extract y after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split shuffles a daily time series; adjacent autocorrelated
# days end up in both train and test — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (same scaler, fitted on the training set only)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 candidates x 10 folds = 3240 fits; n_jobs=-1 uses every CPU core.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
# best_score_ is the mean cross-validated score of the best candidate
# (the estimator's default scorer — R^2 for regressors).
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9985258910599774
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set by default
# (refit=True), so this explicit fit retrains an already-fitted, identical model — redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both inputs to probability distributions
# and computes their KL divergence — it is not a regression error metric, and
# zero-valued y_test entries contribute nothing. Verify this "entropy" is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.009780822718122402 R2 Score: 0.9981313899781104 RMSE: 0.098898 Entropy Value: 0.0007038834783452404
# Rank the six PCA-derived inputs by the tuned model's importance scores.
# NOTE(review): 'feature' carries the original variable names used to label the
# principal components, so each row describes a component, not the raw variable.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values(by='importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.543966 |
| 5 | population | 0.270168 |
| 0 | hospital_beds_per_thousand | 0.096949 |
| 3 | gdp_per_capita | 0.051447 |
| 2 | extreme_poverty | 0.037239 |
| 4 | population_density | 0.000231 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — breaks on any other machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the pipeline.
country1 = 'United Kingdom'
country2 = 'United States'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows for the two selected countries.
# NOTE(review): this chained selection yields a slice; the in-place column assignments
# that follow may raise pandas SettingWithCopyWarning — consider appending .copy().
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084791 |
2136 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per country (groupby 'location') so lags never cross a country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the initial lag gaps with 0 fabricates "zero mortality" history
# for each country's first rows — confirm this is preferable to dropping those rows.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input still
# contains 'Mortality Rate' (the target) and its lag columns — the target leaks into
# the components later used as model inputs, which likely inflates the reported scores.
# PCA is also fit on unscaled columns of wildly different magnitudes, and on the full
# dataset before the train/test split (further leakage).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # number of principal components kept as XGBoost inputs
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the component columns are labelled with the original feature names,
# but each column is a principal component (a mixture of all PCA inputs), not that variable.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummies are never added to X below — df_updated is only
# used to extract y after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split shuffles a daily time series; adjacent autocorrelated
# days end up in both train and test — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (same scaler, fitted on the training set only)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 candidates x 10 folds = 3240 fits; n_jobs=-1 uses every CPU core.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
# best_score_ is the mean cross-validated score of the best candidate
# (the estimator's default scorer — R^2 for regressors).
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9572922428501809
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set by default
# (refit=True), so this explicit fit retrains an already-fitted, identical model — redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both inputs to probability distributions
# and computes their KL divergence — it is not a regression error metric, and
# zero-valued y_test entries contribute nothing. Verify this "entropy" is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 1.3971853498164353 R2 Score: 0.9423880476703369 RMSE: 1.182026 Entropy Value: 0.006984034800282115
# Rank the six PCA-derived inputs by the tuned model's importance scores.
# NOTE(review): 'feature' carries the original variable names used to label the
# principal components, so each row describes a component, not the raw variable.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values(by='importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | diabetes_prevalence | 0.400049 |
| 5 | median_age | 0.369858 |
| 2 | male_smokers | 0.116165 |
| 4 | aged_65_older | 0.060910 |
| 1 | female_smokers | 0.044668 |
| 3 | life_expectancy | 0.008349 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — breaks on any other machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the pipeline.
country1 = 'United Kingdom'
country2 = 'United States'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows for the two selected countries.
# NOTE(review): this chained selection yields a slice; the in-place column assignments
# that follow may raise pandas SettingWithCopyWarning — consider appending .copy().
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2136 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per country (groupby 'location') so lags never cross a country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the initial lag gaps with 0 fabricates "zero mortality" history
# for each country's first rows — confirm this is preferable to dropping those rows.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input still
# contains 'Mortality Rate' (the target) and its lag columns — the target leaks into
# the components later used as model inputs, which likely inflates the reported scores.
# PCA is also fit on unscaled columns of wildly different magnitudes, and on the full
# dataset before the train/test split (further leakage).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # number of principal components kept as XGBoost inputs
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the component columns are labelled with the original feature names,
# but each column is a principal component (a mixture of all PCA inputs), not that variable.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummies are never added to X below — df_updated is only
# used to extract y after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split shuffles a daily time series; adjacent autocorrelated
# days end up in both train and test — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (same scaler, fitted on the training set only)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 candidates x 10 folds = 3240 fits; n_jobs=-1 uses every CPU core.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
# best_score_ is the mean cross-validated score of the best candidate
# (the estimator's default scorer — R^2 for regressors).
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9566213518011253
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set by default
# (refit=True), so this explicit fit retrains an already-fitted, identical model — redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both inputs to probability distributions
# and computes their KL divergence — it is not a regression error metric, and
# zero-valued y_test entries contribute nothing. Verify this "entropy" is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.713063858863409 R2 Score: 0.9705973147727024 RMSE: 0.844431 Entropy Value: 0.008268179051644176
# Rank the six PCA-derived inputs by the tuned model's importance scores.
# NOTE(review): 'feature' carries the original variable names used to label the
# principal components, so each row describes a component, not the raw variable.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values(by='importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | hospital_beds_per_thousand | 0.302760 |
| 1 | human_development_index | 0.227340 |
| 5 | population | 0.207927 |
| 4 | population_density | 0.146404 |
| 2 | extreme_poverty | 0.077408 |
| 3 | gdp_per_capita | 0.038161 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — breaks on any other machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the pipeline.
country1 = 'Czechia'
country2 = 'Estonia'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows for the two selected countries.
# NOTE(review): this chained selection yields a slice; the in-place column assignments
# that follow may raise pandas SettingWithCopyWarning — consider appending .copy().
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.466423 |
2095 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the time series into a supervised-learning layout by adding lagged
# mortality-rate features per country: previous day (1), previous week (7 days),
# and previous month (30 days).  Rows without enough history are filled with 0.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] selects every column from the third onward, which
# here includes the target 'Mortality Rate' and the three lagged-mortality
# columns — the components are therefore partly built from the target itself
# (data leakage).  Consider fitting only on the predictor columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — the same number of inputs the
# XGBoost model analysis uses.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Fix: label the transformed columns as principal components (PC1..PC6) rather
# than reusing the raw feature names.  Each component is a linear mixture of
# *all* input columns, so naming components after individual features would
# misrepresent the feature-importance table computed downstream.
selected_cols = ['PC%d' % (i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column (the dummies stay on df_updated; X below
# is built from the principal components only).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# NOTE(review): the iloc[:,2:] slice transformed above includes 'Mortality
# Rate' and its lagged copies, so the components are partly derived from the
# target (leakage); consider transforming only the predictor columns.
# 70/30 train-test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Exhaustive hyperparameter search for an XGBoost regressor, scored with
# 10-fold cross-validation and parallelised over all available cores.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987100560424087
# Refit the best estimator found by the grid search on the training split.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict mortality rates for the held-out test split.
y_pred = best_model.predict(X_test_scaled)
# Test-set error metrics: R^2, MSE, RMSE, and entropy.
score = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
# NOTE(review): scipy.stats.entropy normalises its two arguments into
# probability distributions and computes their KL divergence — applied to raw
# mortality values it is not a standard regression metric, and it returns inf
# whenever y_test contains zeros where y_pred does not.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0010058615170550663 R2 Score: 0.9985394731187587 RMSE: 0.031715 Entropy Value: 0.0002910800367932633
# Rank the model inputs by XGBoost's importance scores, highest first.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.747054 |
| 0 | diabetes_prevalence | 0.135447 |
| 5 | median_age | 0.081459 |
| 2 | male_smokers | 0.029858 |
| 3 | life_expectancy | 0.005972 |
| 4 | aged_65_older | 0.000209 |
# Reload the prepared dataframe covering all 26 countries.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Czechia'
country2 = 'Estonia'
# Keep the country-health-index predictors plus identifiers and the target,
# then restrict the rows to the two countries being compared.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand',
             'human_development_index', 'extreme_poverty', 'gdp_per_capita',
             'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[keep_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
2095 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the time series into a supervised-learning layout by adding lagged
# mortality-rate features per country: previous day (1), previous week (7 days),
# and previous month (30 days).  Rows without enough history are filled with 0.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] selects every column from the third onward, which
# here includes the target 'Mortality Rate' and the three lagged-mortality
# columns — the components are therefore partly built from the target itself
# (data leakage).  Consider fitting only on the predictor columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — the same number of inputs the
# XGBoost model analysis uses.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Fix: label the transformed columns as principal components (PC1..PC6) rather
# than reusing the raw feature names.  Each component is a linear mixture of
# *all* input columns, so naming components after individual features would
# misrepresent the feature-importance table computed downstream.
selected_cols = ['PC%d' % (i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column (the dummies stay on df_updated; X below
# is built from the principal components only).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# NOTE(review): the iloc[:,2:] slice transformed above includes 'Mortality
# Rate' and its lagged copies, so the components are partly derived from the
# target (leakage); consider transforming only the predictor columns.
# 70/30 train-test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Exhaustive hyperparameter search for an XGBoost regressor, scored with
# 10-fold cross-validation and parallelised over all available cores.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9982992559865769
# Refit the best estimator found by the grid search on the training split.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict mortality rates for the held-out test split.
y_pred = best_model.predict(X_test_scaled)
# Test-set error metrics: R^2, MSE, RMSE, and entropy.
score = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
# NOTE(review): scipy.stats.entropy normalises its two arguments into
# probability distributions and computes their KL divergence — applied to raw
# mortality values it is not a standard regression metric, and it returns inf
# whenever y_test contains zeros where y_pred does not (as the recorded
# "Entropy Value: inf" output below shows).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0013297376906791 R2 Score: 0.9980691997762051 RMSE: 0.036466 Entropy Value: inf
# Rank the model inputs by XGBoost's importance scores, highest first.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.691546 |
| 0 | hospital_beds_per_thousand | 0.132334 |
| 5 | population | 0.127958 |
| 2 | extreme_poverty | 0.030192 |
| 3 | gdp_per_capita | 0.017370 |
| 4 | population_density | 0.000600 |
# Reload the prepared dataframe covering all 26 countries.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Slovakia'
country2 = 'Slovenia'
# Keep the population-health predictors plus identifiers and the target,
# then restrict the rows to the two countries being compared.
keep_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers',
             'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
             'Mortality Rate']
df_updated = df_updated[keep_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.536669 |
2091 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the time series into a supervised-learning layout by adding lagged
# mortality-rate features per country: previous day (1), previous week (7 days),
# and previous month (30 days).  Rows without enough history are filled with 0.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] selects every column from the third onward, which
# here includes the target 'Mortality Rate' and the three lagged-mortality
# columns — the components are therefore partly built from the target itself
# (data leakage).  Consider fitting only on the predictor columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — the same number of inputs the
# XGBoost model analysis uses.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Fix: label the transformed columns as principal components (PC1..PC6) rather
# than reusing the raw feature names.  Each component is a linear mixture of
# *all* input columns, so naming components after individual features would
# misrepresent the feature-importance table computed downstream.
selected_cols = ['PC%d' % (i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column (the dummies stay on df_updated; X below
# is built from the principal components only).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# NOTE(review): the iloc[:,2:] slice transformed above includes 'Mortality
# Rate' and its lagged copies, so the components are partly derived from the
# target (leakage); consider transforming only the predictor columns.
# 70/30 train-test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Exhaustive hyperparameter search for an XGBoost regressor, scored with
# 10-fold cross-validation and parallelised over all available cores.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9984547895987792
# Refit the best estimator found by the grid search on the training split.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict mortality rates for the held-out test split.
y_pred = best_model.predict(X_test_scaled)
# Test-set error metrics: R^2, MSE, RMSE, and entropy.
score = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
# NOTE(review): scipy.stats.entropy normalises its two arguments into
# probability distributions and computes their KL divergence — applied to raw
# mortality values it is not a standard regression metric, and it returns inf
# whenever y_test contains zeros where y_pred does not.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002902405614533229 R2 Score: 0.9985760200848308 RMSE: 0.053874 Entropy Value: 0.0005018366914602828
# Rank the model inputs by XGBoost's importance scores, highest first.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.778330 |
| 0 | diabetes_prevalence | 0.172858 |
| 5 | median_age | 0.023924 |
| 2 | male_smokers | 0.016332 |
| 3 | life_expectancy | 0.004305 |
| 4 | aged_65_older | 0.004251 |
# Reload the prepared dataframe covering all 26 countries.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Slovakia'
country2 = 'Slovenia'
# Keep the country-health-index predictors plus identifiers and the target,
# then restrict the rows to the two countries being compared.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand',
             'human_development_index', 'extreme_poverty', 'gdp_per_capita',
             'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[keep_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.536669 |
2091 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the time series into a supervised-learning layout by adding lagged
# mortality-rate features per country: previous day (1), previous week (7 days),
# and previous month (30 days).  Rows without enough history are filled with 0.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] selects every column from the third onward, which
# here includes the target 'Mortality Rate' and the three lagged-mortality
# columns — the components are therefore partly built from the target itself
# (data leakage).  Consider fitting only on the predictor columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — the same number of inputs the
# XGBoost model analysis uses.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Fix: label the transformed columns as principal components (PC1..PC6) rather
# than reusing the raw feature names.  Each component is a linear mixture of
# *all* input columns, so naming components after individual features would
# misrepresent the feature-importance table computed downstream.
selected_cols = ['PC%d' % (i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column (the dummies stay on df_updated; X below
# is built from the principal components only).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# NOTE(review): the iloc[:,2:] slice transformed above includes 'Mortality
# Rate' and its lagged copies, so the components are partly derived from the
# target (leakage); consider transforming only the predictor columns.
# 70/30 train-test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Exhaustive hyperparameter search for an XGBoost regressor, scored with
# 10-fold cross-validation and parallelised over all available cores.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979685572320796
# Fit the model using the best hyperparameters
# NOTE(review): best_estimator_ is already refit on the full training set by
# GridSearchCV (refit=True is the default), so this second fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to probability
# distributions and returns their KL divergence — it is not a standard regression
# metric, and it is undefined/infinite where y_test is 0 but y_pred is not.
# TODO confirm this is the intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.010286891805743988 R2 Score: 0.994953039214247 RMSE: 0.101424 Entropy Value: 0.001005217028297138
# Tabulate and rank the model's feature importances.
# NOTE(review): the model was trained on principal components, so these "importances"
# belong to PCs, not to the original features whose names label the rows.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.658730 |
| 5 | population | 0.288222 |
| 2 | extreme_poverty | 0.024648 |
| 0 | hospital_beds_per_thousand | 0.015281 |
| 3 | gdp_per_capita | 0.007035 |
| 4 | population_density | 0.006085 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute local Windows path — not portable; consider a relative path
# or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair for this comparison run (population-health-index feature set).
country1 = 'Bulgaria'
country2 = 'Latvia'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two selected countries' rows.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631969 |
2065 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes rows are already sorted by date within each location —
# TODO confirm the CSV guarantees that ordering.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): fillna(0) fabricates "zero mortality" for the first 1/7/30 days of each
# country's series; dropping those rows would avoid injecting artificial values.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] spans every column after location/date — i.e. the 6 health
# features PLUS 'Mortality Rate' (the target) and its three lags. Fitting PCA on a
# matrix that contains the target leaks it into the components used as predictors,
# which likely explains the near-perfect scores downstream. Also: PCA is fit on all
# rows (before the train/test split) and on unscaled data, so large-magnitude columns
# dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project all rows onto the first 6 components (fit on a matrix that included the
# target — see the leakage note at the PCA fit above).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the column names below label principal components, not the original
# features they are named after.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are unused downstream — X comes from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split on time-series rows — see earlier look-ahead-bias note.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using the scaler fitted on the training set only)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): KFold CV on time-series rows; see the leakage notes at the PCA step.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters (best_score_ is R^2, the regressor's default score)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9581157049903706
# Fit the model using the best hyperparameters
# NOTE(review): redundant refit — GridSearchCV already refit best_estimator_ (harmless).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): entropy(pk, qk) is a KL divergence over normalized inputs, not a
# regression error metric — TODO confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0020134595466409933 R2 Score: 0.9988351175179158 RMSE: 0.044872 Entropy Value: 0.0002481296115330379
# Rank feature importances.
# NOTE(review): importances belong to principal components, not the original features
# whose names label the rows.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.453982 |
| 0 | diabetes_prevalence | 0.442633 |
| 2 | male_smokers | 0.052401 |
| 1 | female_smokers | 0.027709 |
| 4 | aged_65_older | 0.016088 |
| 3 | life_expectancy | 0.007187 |
# Importing the dataframe of all 26 countries (fresh copy; df_updated was filtered
# and mutated by the previous section)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Same country pair, now with the country-health-index (socio-economic) feature set.
country1 = 'Bulgaria'
country2 = 'Latvia'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631969 |
2065 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): assumes rows are date-sorted within each location — TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): fillna(0) fabricates zero mortality for each series' first 1/7/30 days.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' (the target) and its lags, so the
# target leaks into the PCA components used as predictors; PCA is also fit before the
# train/test split and on unscaled data (population dominates the variance).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components mislabeled with original feature names.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): dummy columns unused downstream — X is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split on time-series rows — look-ahead bias risk.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): KFold CV on time-series rows; see the PCA-step leakage notes.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters (best_score_ is R^2)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9564879049798748
# Fit the model using the best hyperparameters
# NOTE(review): redundant refit — best_estimator_ is already refit by GridSearchCV.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): entropy() here is a KL divergence over normalized vectors, not a
# regression metric — TODO confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0038412225079292634 R2 Score: 0.9977776693767008 RMSE: 0.061978 Entropy Value: 0.0004044101196355003
# Rank feature importances.
# NOTE(review): these importances describe principal components, not the original
# features whose names label them.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | hospital_beds_per_thousand | 0.829882 |
| 5 | population | 0.121940 |
| 2 | extreme_poverty | 0.019138 |
| 1 | human_development_index | 0.016068 |
| 4 | population_density | 0.008275 |
| 3 | gdp_per_capita | 0.004696 |
# Importing the dataframe of all 26 countries (fresh copy for the next country pair)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# New country pair (population-health-index feature set).
country1 = 'Romania'
country2 = 'Serbia'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
2076 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): assumes rows are date-sorted within each location — TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): fillna(0) fabricates zero mortality for each series' first 1/7/30 days.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' (the target) plus its lags — the
# target leaks into the components used as predictors; PCA is also fit on all rows
# before the split and on unscaled data.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components mislabeled with original feature names.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): dummy columns unused downstream — X is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split on time-series rows — look-ahead bias risk.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): KFold CV on time-series rows; see the PCA-step leakage notes.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters (best_score_ is R^2)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9982335652486138
# Fit the model using the best hyperparameters
# NOTE(review): redundant refit — best_estimator_ is already refit by GridSearchCV.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): entropy() here is a KL divergence over normalized vectors, not a
# regression metric — TODO confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0015563262679148646 R2 Score: 0.9990938937518747 RMSE: 0.039450 Entropy Value: 0.0004499334264955072
# Rank feature importances.
# NOTE(review): importances describe principal components, not the original features
# whose names label them.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | diabetes_prevalence | 0.470175 |
| 5 | median_age | 0.305786 |
| 1 | female_smokers | 0.213590 |
| 2 | male_smokers | 0.007235 |
| 3 | life_expectancy | 0.002493 |
| 4 | aged_65_older | 0.000721 |
# Importing the dataframe of all 26 countries (fresh copy for the country-index run)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Same country pair, now with the country-health-index (socio-economic) feature set.
country1 = 'Romania'
country2 = 'Serbia'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
2076 rows × 9 columns
# Convert the per-country mortality time series into a supervised-learning
# table: add lagged mortality features (previous day / week / month) with
# groupby + shift, zero-fill the leading rows that have no history, then fit
# PCA on every numeric column to address multi-collinearity.
#
# NOTE(review): df_updated.iloc[:, 2:] also contains 'Mortality Rate' (the
# prediction target) and its lagged copies, so the target leaks into the PCA
# inputs — likely why the downstream CV/R^2 scores are near-perfect; confirm
# this is intended.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
    df_updated[lag_col] = shifted.fillna(0)
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Project onto the first six principal components (one per original input
# variable) and rebuild a modelling frame.
#
# NOTE(review): the component columns are relabelled with the original feature
# names, but each PC is a linear mixture of *all* inputs — the feature
# importances computed later describe components, not the named variables;
# confirm this interpretation is intended.
n_components = 6
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population_density',
                 'population']
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
principal_df = pd.DataFrame(principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label on the working frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed; fit scaling on the training data only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the train-fitted scaler so the model never
# sees test-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Tune an XGBoost regressor over a small hyperparameter grid using
# 10-fold cross-validation, parallelized across all cores.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration.
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_}")
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979474526774726
# Refit the best estimator on the full training split, then evaluate on the
# held-out test split with MSE, RMSE, R^2, and an "entropy" value.
#
# NOTE(review): scipy.stats.entropy normalizes its arguments and returns the
# KL divergence D(y_test || y_pred); it is not a standard regression metric
# and is ill-defined when y_pred has zeros where y_test is positive — confirm
# this is intentional.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print(f"MSE:  {mse}")
print(f"R2 Score: {score}")
print(f"RMSE: {rmse:f}")
print(f"Entropy Value: {entropy_val}")
MSE: 0.002076185405706244 R2 Score: 0.9987912273877525 RMSE: 0.045565 Entropy Value: 0.00045853981129780964
# Rank the model inputs (PCA components labelled with the original feature
# names) by XGBoost importance, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.663901 |
| 0 | hospital_beds_per_thousand | 0.186232 |
| 1 | human_development_index | 0.124847 |
| 2 | extreme_poverty | 0.013139 |
| 3 | gdp_per_capita | 0.011260 |
| 4 | population_density | 0.000621 |
# Country pair-by-pair analysis relative to male smokers: reload the cleaned
# and preprocessed Our World in Data COVID-19 dataset from disk.
covid_csv = "C:/Users/marco/Downloads/covid-data-cleaned.csv"
df = pd.read_csv(covid_csv)
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Build per-country slices used for the male-smoker country pairings
# (13 pairs of countries).
df_Canada = df.loc[df["location"] == "Canada"]
df_Denmark = df.loc[df["location"] == "Denmark"]
df_Finland = df.loc[df["location"] == "Finland"]
df_Iceland = df.loc[df["location"] == "Iceland"]
df_Ireland = df.loc[df["location"] == "Ireland"]
df_Luxembourg = df.loc[df["location"] == "Luxembourg"]
df_Netherlands = df.loc[df["location"] == "Netherlands"]
df_Slovenia = df.loc[df["location"] == "Slovenia"]
df_Sweden = df.loc[df["location"] == "Sweden"]
df_UnitedKingdom = df.loc[df["location"] == "United Kingdom"]
df_UnitedStates = df.loc[df["location"] == "United States"]
df_Austria = df.loc[df["location"] == "Austria"]
df_Belgium = df.loc[df["location"] == "Belgium"]
df_Czechia = df.loc[df["location"] == "Czechia"]
df_Estonia = df.loc[df["location"] == "Estonia"]
df_France = df.loc[df["location"] == "France"]
df_Italy = df.loc[df["location"] == "Italy"]
df_Portugal = df.loc[df["location"] == "Portugal"]
df_Romania = df.loc[df["location"] == "Romania"]
df_Serbia = df.loc[df["location"] == "Serbia"]
df_Slovakia = df.loc[df["location"] == "Slovakia"]
df_Spain = df.loc[df["location"] == "Spain"]
df_Switzerland = df.loc[df["location"] == "Switzerland"]
df_Bulgaria = df.loc[df["location"] == "Bulgaria"]
df_Cyprus = df.loc[df["location"] == "Cyprus"]
df_Latvia = df.loc[df["location"] == "Latvia"]
# Drop the first two UK rows — presumably to align its date range with the
# other countries; TODO confirm against the raw data.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
# Concatenate all 26 per-country slices (UK with its first two rows dropped)
# into a single combined frame.
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia,
              df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands,
              df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland,
              df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland,
              df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain,
              df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)
# Persist the combined frame, then reload it as the working dataframe.
# NOTE(review): to_csv writes "dataframe-one.csv" into the current working
# directory while read_csv loads it from Downloads — these refer to the same
# file only when the notebook runs from Downloads. The default index=True
# also adds an index column on export; confirm both are intended.
dataframe_one.to_csv("dataframe-one.csv")
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Restrict the working frame to one country pair and the
# population-health-index feature set ahead of the XGBoost analysis.
country1 = 'Canada'
country2 = 'Denmark'
population_index_cols = ['location', 'date', 'cardiovasc_death_rate',
                         'diabetes_prevalence', 'female_smokers',
                         'life_expectancy', 'aged_65_older', 'median_age',
                         'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            population_index_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 114.767 | 6.41 | 19.3 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 114.767 | 6.41 | 19.3 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 114.767 | 6.41 | 19.3 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 114.767 | 6.41 | 19.3 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 114.767 | 6.41 | 19.3 | 80.90 | 19.677 | 42.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 82.43 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 82.43 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 82.43 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 82.43 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 82.43 | 16.984 | 41.4 | 1.093162 |
2134 rows × 9 columns
# Convert the per-country mortality time series into a supervised-learning
# table: add lagged mortality features (previous day / week / month) with
# groupby + shift, zero-fill the leading rows that have no history, then fit
# PCA on every numeric column to address multi-collinearity.
#
# NOTE(review): df_updated.iloc[:, 2:] also contains 'Mortality Rate' (the
# prediction target) and its lagged copies, so the target leaks into the PCA
# inputs — likely why the downstream CV/R^2 scores are near-perfect; confirm
# this is intended.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
    df_updated[lag_col] = shifted.fillna(0)
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Project onto the first six principal components (one per original input
# variable) and rebuild a modelling frame.
#
# NOTE(review): the component columns are relabelled with the original feature
# names, but each PC is a linear mixture of *all* inputs — the feature
# importances computed later describe components, not the named variables;
# confirm this interpretation is intended.
n_components = 6
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence',
                 'female_smokers', 'life_expectancy', 'aged_65_older',
                 'median_age']
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
principal_df = pd.DataFrame(principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label on the working frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed; fit scaling on the training data only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the train-fitted scaler so the model never
# sees test-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Tune an XGBoost regressor over a small hyperparameter grid using
# 10-fold cross-validation, parallelized across all cores.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration.
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_}")
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9991198472250268
# Refit the best estimator on the full training split, then evaluate on the
# held-out test split with MSE, RMSE, R^2, and an "entropy" value.
#
# NOTE(review): scipy.stats.entropy normalizes its arguments and returns the
# KL divergence D(y_test || y_pred); it is not a standard regression metric
# and is ill-defined when y_pred has zeros where y_test is positive — confirm
# this is intentional.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print(f"MSE:  {mse}")
print(f"R2 Score: {score}")
print(f"RMSE: {rmse:f}")
print(f"Entropy Value: {entropy_val}")
MSE: 0.0035622981531189114 R2 Score: 0.999149818574882 RMSE: 0.059685 Entropy Value: 0.000330791989058651
# Rank the model inputs (PCA components labelled with the original feature
# names) by XGBoost importance, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.743513 |
| 0 | cardiovasc_death_rate | 0.166964 |
| 5 | median_age | 0.068076 |
| 2 | female_smokers | 0.017976 |
| 3 | life_expectancy | 0.003401 |
| 4 | aged_65_older | 0.000071 |
# Reload the combined dataframe of all 26 countries as the working frame.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Restrict the working frame to one country pair and the country-health-index
# feature set ahead of the XGBoost analysis.
country1 = 'Canada'
country2 = 'Denmark'
country_index_cols = ['location', 'date', 'hospital_beds_per_thousand',
                      'human_development_index', 'extreme_poverty',
                      'gdp_per_capita', 'population_density', 'population',
                      'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            country_index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 2.5 | 0.940 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 2.5 | 0.940 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 2.5 | 0.940 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 2.5 | 0.940 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 2.5 | 0.940 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.5 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.5 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.5 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.5 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.5 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.093162 |
2134 rows × 9 columns
# Convert the per-country mortality time series into a supervised-learning
# table: add lagged mortality features (previous day / week / month) with
# groupby + shift, zero-fill the leading rows that have no history, then fit
# PCA on every numeric column to address multi-collinearity.
#
# NOTE(review): df_updated.iloc[:, 2:] also contains 'Mortality Rate' (the
# prediction target) and its lagged copies, so the target leaks into the PCA
# inputs — likely why the downstream CV/R^2 scores are near-perfect; confirm
# this is intended.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
    df_updated[lag_col] = shifted.fillna(0)
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Project onto the first six principal components (one per original input
# variable) and rebuild a modelling frame.
#
# NOTE(review): the component columns are relabelled with the original feature
# names, but each PC is a linear mixture of *all* inputs — the feature
# importances computed later describe components, not the named variables;
# confirm this interpretation is intended.
n_components = 6
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population_density',
                 'population']
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
principal_df = pd.DataFrame(principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label on the working frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed; fit scaling on the training data only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the train-fitted scaler so the model never
# sees test-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Tune an XGBoost regressor over a small hyperparameter grid using
# 10-fold cross-validation, parallelized across all cores.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration.
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_}")
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9987973362067224
# Refit the best estimator on the full training split, then evaluate on the
# held-out test split with MSE, RMSE, R^2, and an "entropy" value.
#
# NOTE(review): scipy.stats.entropy normalizes its arguments and returns the
# KL divergence D(y_test || y_pred); it is not a standard regression metric
# and is ill-defined when y_pred has zeros where y_test is positive — confirm
# this is intentional.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print(f"MSE:  {mse}")
print(f"R2 Score: {score}")
print(f"RMSE: {rmse:f}")
print(f"Entropy Value: {entropy_val}")
MSE: 0.0034098009007722373 R2 Score: 0.9991862137124461 RMSE: 0.058394 Entropy Value: 0.00042828048166675814
# Rank the model inputs (PCA components labelled with the original feature
# names) by XGBoost importance, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.565996 |
| 0 | hospital_beds_per_thousand | 0.211338 |
| 3 | gdp_per_capita | 0.083368 |
| 5 | population | 0.079615 |
| 2 | extreme_poverty | 0.059425 |
| 4 | population_density | 0.000259 |
# Reload the combined dataframe of all 26 countries as the working frame.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Restrict the working frame to one country pair and the
# population-health-index feature set ahead of the XGBoost analysis.
country1 = 'Finland'
country2 = 'Iceland'
population_index_cols = ['location', 'date', 'cardiovasc_death_rate',
                         'diabetes_prevalence', 'female_smokers',
                         'life_expectancy', 'aged_65_older', 'median_age',
                         'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            population_index_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 153.507 | 5.76 | 18.3 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7311 | Finland | 1/30/2020 | 153.507 | 5.76 | 18.3 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7312 | Finland | 1/31/2020 | 153.507 | 5.76 | 18.3 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7313 | Finland | 2/1/2020 | 153.507 | 5.76 | 18.3 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7314 | Finland | 2/2/2020 | 153.507 | 5.76 | 18.3 | 81.91 | 21.228 | 42.8 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 5.31 | 14.3 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 5.31 | 14.3 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 5.31 | 14.3 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 5.31 | 14.3 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 5.31 | 14.3 | 82.99 | 14.431 | 37.3 | 0.11011 |
2102 rows × 9 columns
'''
Convert the time series into a supervised-learning table: add lagged
mortality-rate features (previous day / week / month) with pandas shift(),
so that each row is one observation and XGBoost can be applied directly to
assess which variables best predict COVID-19 mortality per country.
'''
# Create lagged mortality-rate features within each country so values never
# leak across the Finland/Iceland boundary.
# NOTE(review): shift() assumes rows are already sorted by date within each
# location -- confirm the upstream ordering.
df_updated['prev_day_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; treat it as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity.
# FIX: the original code fit PCA on df_updated.iloc[:, 2:], which included
# the 'Mortality Rate' target column (and its lagged copies) among the
# components -- target leakage that inflated the downstream CV/test scores.
# Fit only on the six input features.
feature_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep 6 components (= number of input variables for the XGBoost model).
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# NOTE(review): the column names are kept for downstream compatibility, but
# each column holds a principal component, NOT the like-named raw feature.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never added to X below,
# so this encoding is effectively dead code for the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# NOTE(review): these names index principal_df, whose columns hold PCA
# components, not the like-named raw features.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target: daily COVID-19 mortality rate.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split on time-series rows mixes past and
# future observations between train and test; a chronological split would
# give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 combinations x 10 folds)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9976265506566767
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set
# by default (refit=True), so this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions and computes KL divergence -- it is not a standard regression
# metric, and zeros in y_test/y_pred make the value hard to interpret.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002104558864221546 R2 Score: 0.9982281951732214 RMSE: 0.045875 Entropy Value: 0.0006809600254822203
# NOTE(review): importances are attributed to PCA components carrying raw
# feature names; do not read "diabetes_prevalence" etc. as the importance of
# the original variables themselves.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.566317 |
| 0 | cardiovasc_death_rate | 0.374238 |
| 5 | median_age | 0.048315 |
| 2 | female_smokers | 0.006486 |
| 3 | life_expectancy | 0.003613 |
| 4 | aged_65_older | 0.001031 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- consider a relative path
# or pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the country-health pipeline.
country1 = 'Finland'
country2 = 'Iceland'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the two countries under comparison (returns a filtered copy).
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7311 | Finland | 1/30/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7312 | Finland | 1/31/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7313 | Finland | 2/1/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7314 | Finland | 2/2/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
2102 rows × 9 columns
'''
Convert the time series into a supervised-learning table: add lagged
mortality-rate features (previous day / week / month) with pandas shift(),
so that each row is one observation and XGBoost can be applied directly to
assess which variables best predict COVID-19 mortality per country.
'''
# Create lagged mortality-rate features within each country so values never
# leak across the Finland/Iceland boundary.
# NOTE(review): shift() assumes rows are already sorted by date within each
# location -- confirm the upstream ordering.
df_updated['prev_day_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; treat it as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity.
# FIX: the original code fit PCA on df_updated.iloc[:, 2:], which included
# the 'Mortality Rate' target column (and its lagged copies) among the
# components -- target leakage that inflated the downstream CV/test scores.
# Fit only on the six input features.
feature_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep 6 components (= number of input variables for the XGBoost model).
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# NOTE(review): the column names are kept for downstream compatibility, but
# each column holds a principal component, NOT the like-named raw feature.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never added to X below,
# so this encoding is effectively dead code for the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# NOTE(review): these names index principal_df, whose columns hold PCA
# components, not the like-named raw features.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Target: daily COVID-19 mortality rate.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split on time-series rows mixes past and
# future observations between train and test; a chronological split would
# give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 combinations x 10 folds)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9972033148176618
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set
# by default (refit=True), so this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions and computes KL divergence -- it is not a standard regression
# metric, and zeros in y_test/y_pred make the value hard to interpret.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004836890703946099 R2 Score: 0.9959278752229046 RMSE: 0.069548 Entropy Value: 0.0012304204035138641
# NOTE(review): importances are attributed to PCA components carrying raw
# feature names; do not read "human_development_index" etc. as the importance
# of the original variables themselves.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.327785 |
| 0 | hospital_beds_per_thousand | 0.249805 |
| 2 | extreme_poverty | 0.231512 |
| 5 | population | 0.119151 |
| 3 | gdp_per_capita | 0.062924 |
| 4 | population_density | 0.008824 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- consider a relative path
# or pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the population-health pipeline.
country1 = 'Ireland'
country2 = 'Luxembourg'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two countries under comparison (returns a filtered copy).
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 128.275 | 4.42 | 20.9 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 128.275 | 4.42 | 20.9 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 128.275 | 4.42 | 20.9 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 128.275 | 4.42 | 20.9 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 128.275 | 4.42 | 20.9 | 82.25 | 14.312 | 39.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 126.459 | 3.28 | 23.0 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 126.459 | 3.28 | 23.0 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 126.459 | 3.28 | 23.0 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 126.459 | 3.28 | 23.0 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 126.459 | 3.28 | 23.0 | 82.30 | 13.928 | 38.7 | 0.491388 |
2076 rows × 9 columns
'''
Convert the time series into a supervised-learning table: add lagged
mortality-rate features (previous day / week / month) with pandas shift(),
so that each row is one observation and XGBoost can be applied directly to
assess which variables best predict COVID-19 mortality per country.
'''
# Create lagged mortality-rate features within each country so values never
# leak across the Ireland/Luxembourg boundary.
# NOTE(review): shift() assumes rows are already sorted by date within each
# location -- confirm the upstream ordering.
df_updated['prev_day_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; treat it as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity.
# FIX: the original code fit PCA on df_updated.iloc[:, 2:], which included
# the 'Mortality Rate' target column (and its lagged copies) among the
# components -- target leakage that inflated the downstream CV/test scores.
# Fit only on the six input features.
feature_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep 6 components (= number of input variables for the XGBoost model).
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# NOTE(review): the column names are kept for downstream compatibility, but
# each column holds a principal component, NOT the like-named raw feature.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never added to X below,
# so this encoding is effectively dead code for the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# NOTE(review): these names index principal_df, whose columns hold PCA
# components, not the like-named raw features.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target: daily COVID-19 mortality rate.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split on time-series rows mixes past and
# future observations between train and test; a chronological split would
# give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 combinations x 10 folds)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987986251987184
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set
# by default (refit=True), so this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions and computes KL divergence -- it is not a standard regression
# metric, and zeros in y_test/y_pred make the value hard to interpret.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0017173908797679875 R2 Score: 0.9992479140176413 RMSE: 0.041441 Entropy Value: 0.00039280120442968804
# NOTE(review): importances are attributed to PCA components carrying raw
# feature names; do not read "median_age" etc. as the importance of the
# original variables themselves.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.765400 |
| 0 | cardiovasc_death_rate | 0.195268 |
| 1 | diabetes_prevalence | 0.026928 |
| 2 | female_smokers | 0.010463 |
| 3 | life_expectancy | 0.001324 |
| 4 | aged_65_older | 0.000616 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- consider a relative path
# or pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the country-health pipeline.
country1 = 'Ireland'
country2 = 'Luxembourg'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the two countries under comparison (returns a filtered copy).
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
2076 rows × 9 columns
'''
Convert the time series into a supervised-learning table: add lagged
mortality-rate features (previous day / week / month) with pandas shift(),
so that each row is one observation and XGBoost can be applied directly to
assess which variables best predict COVID-19 mortality per country.
'''
# Create lagged mortality-rate features within each country so values never
# leak across the Ireland/Luxembourg boundary.
# NOTE(review): shift() assumes rows are already sorted by date within each
# location -- confirm the upstream ordering.
df_updated['prev_day_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; treat it as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity.
# FIX: the original code fit PCA on df_updated.iloc[:, 2:], which included
# the 'Mortality Rate' target column (and its lagged copies) among the
# components -- target leakage that inflated the downstream CV/test scores.
# Fit only on the six input features.
feature_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep 6 components (= number of input variables for the XGBoost model).
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# NOTE(review): the column names are kept for downstream compatibility, but
# each column holds a principal component, NOT the like-named raw feature.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never added to X below,
# so this encoding is effectively dead code for the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# NOTE(review): these names index principal_df, whose columns hold PCA
# components, not the like-named raw features.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Target: daily COVID-19 mortality rate.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split on time-series rows mixes past and
# future observations between train and test; a chronological split would
# give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 combinations x 10 folds)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9986051830769147
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): GridSearchCV refits best_estimator_ on the whole training set by default
# (refit=True), so this explicit fit() is redundant — harmless but wasted work.
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into probability
# distributions and returns their KL divergence; y_test/y_pred are regression targets,
# so this value is not a meaningful "entropy" of the model's error — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0015122598797065847 R2 Score: 0.9993377457219498 RMSE: 0.038888 Entropy Value: 0.0003022118010805689
# Rank the six model inputs by their learned XGBoost importance scores, largest first.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.715384 |
| 1 | human_development_index | 0.205967 |
| 0 | hospital_beds_per_thousand | 0.052834 |
| 2 | extreme_poverty | 0.024062 |
| 3 | gdp_per_capita | 0.001458 |
| 4 | population_density | 0.000295 |
# Importing the dataframe of all 26 countries
# Re-load the full cleaned dataset from a local CSV so the next analysis starts from the
# complete frame (df_updated was filtered/overwritten by the previous cells).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis
country1 = 'Netherlands'
country2 = 'Slovenia'
# Extracting important features for XGBoost Model Analysis for the population health index:
# keep the identifiers, the six population-health features, and the mortality-rate target,
# restricted to the two countries of interest.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 24.4 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 24.4 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 24.4 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 24.4 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 24.4 | 82.28 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 81.32 | 19.062 | 44.5 | 0.536669 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (grouping by location keeps one country's lags from bleeding into the other's rows)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows makes "no history yet" indistinguishable
# from a true 0.0 mortality rate; dropping those rows may be safer.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] takes every column after 'location' and 'date', which includes
# 'Mortality Rate' (the target) and its three lag columns, and the fit runs on ALL rows
# before the train/test split in the next cell — both are forms of leakage; confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project columns 2 onward onto the first 6 principal components.
# NOTE(review): `pca` was fit in the previous cell on the FULL pre-split dataframe,
# including 'Mortality Rate' and its lag columns — target/test leakage into the components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these labels reuse the ORIGINAL feature names, but each column is a
# principal component (a linear mix of all inputs), so the downstream "feature
# importances" table names PCs, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — only 'Mortality Rate' is read from
# df_updated below — so this encoding step appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: the six principal components; y: the daily COVID-19 mortality rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# (random 70/30 split of time-series rows; fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
# (same scaler, fitted on the training set only — no scaling leakage here)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 parameter combinations in total
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 candidates x 10 folds = 3240 fits; n_jobs=-1 parallelizes across all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
# best_score_ is the mean cross-validated score of the best candidate
# (the regressor's default scorer — R^2 — since no `scoring` argument is given)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9992962940278153
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): GridSearchCV refits best_estimator_ on the whole training set by default
# (refit=True), so this explicit fit() is redundant — harmless but wasted work.
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into probability
# distributions and returns their KL divergence; y_test/y_pred are regression targets,
# so this value is not a meaningful "entropy" of the model's error — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.006026879845882446 R2 Score: 0.9992643953405248 RMSE: 0.077633 Entropy Value: 0.0003719429700690147
# Rank the six model inputs by their learned XGBoost importance scores, largest first.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.933080 |
| 0 | cardiovasc_death_rate | 0.038983 |
| 5 | median_age | 0.013890 |
| 2 | female_smokers | 0.012534 |
| 3 | life_expectancy | 0.001336 |
| 4 | aged_65_older | 0.000178 |
# Importing the dataframe of all 26 countries
# Re-load the full cleaned dataset from a local CSV so the next analysis starts from the
# complete frame (df_updated was filtered/overwritten by the previous cells).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis
country1 = 'Netherlands'
country2 = 'Slovenia'
# Extracting important features for XGBoost Model Analysis for the country health index:
# keep the identifiers, the six country-health features, and the mortality-rate target,
# restricted to the two countries of interest.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.536669 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (grouping by location keeps one country's lags from bleeding into the other's rows)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows makes "no history yet" indistinguishable
# from a true 0.0 mortality rate; dropping those rows may be safer.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] takes every column after 'location' and 'date', which includes
# 'Mortality Rate' (the target) and its three lag columns, and the fit runs on ALL rows
# before the train/test split in the next cell — both are forms of leakage; confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project columns 2 onward onto the first 6 principal components.
# NOTE(review): `pca` was fit in the previous cell on the FULL pre-split dataframe,
# including 'Mortality Rate' and its lag columns — target/test leakage into the components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these labels reuse the ORIGINAL feature names, but each column is a
# principal component (a linear mix of all inputs), so the downstream "feature
# importances" table names PCs, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — only 'Mortality Rate' is read from
# df_updated below — so this encoding step appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: the six principal components; y: the daily COVID-19 mortality rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# (random 70/30 split of time-series rows; fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
# (same scaler, fitted on the training set only — no scaling leakage here)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 parameter combinations in total
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 candidates x 10 folds = 3240 fits; n_jobs=-1 parallelizes across all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
# best_score_ is the mean cross-validated score of the best candidate
# (the regressor's default scorer — R^2 — since no `scoring` argument is given)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9991623229575961
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): GridSearchCV refits best_estimator_ on the whole training set by default
# (refit=True), so this explicit fit() is redundant — harmless but wasted work.
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into probability
# distributions and returns their KL divergence; y_test/y_pred are regression targets,
# so this value is not a meaningful "entropy" of the model's error — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0044516891807437255 R2 Score: 0.9994566536271454 RMSE: 0.066721 Entropy Value: 0.00032994017253004367
# Rank the six model inputs by their learned XGBoost importance scores, largest first.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.874638 |
| 2 | extreme_poverty | 0.051845 |
| 0 | hospital_beds_per_thousand | 0.046778 |
| 5 | population | 0.018835 |
| 3 | gdp_per_capita | 0.006703 |
| 4 | population_density | 0.001201 |
# Importing the dataframe of all 26 countries
# Re-load the full cleaned dataset from a local CSV so the next analysis starts from the
# complete frame (df_updated was filtered/overwritten by the previous cells).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis
country1 = 'Sweden'
country2 = 'United Kingdom'
# Extracting important features for XGBoost Model Analysis for the population health index:
# keep the identifiers, the six population-health features, and the mortality-rate target,
# restricted to the two countries of interest.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 122.137 | 4.28 | 20.0 | 81.32 | 18.517 | 40.8 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 122.137 | 4.28 | 20.0 | 81.32 | 18.517 | 40.8 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 122.137 | 4.28 | 20.0 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 122.137 | 4.28 | 20.0 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 122.137 | 4.28 | 20.0 | 81.32 | 18.517 | 40.8 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 133.982 | 4.79 | 18.8 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 133.982 | 4.79 | 18.8 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 133.982 | 4.79 | 18.8 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 133.982 | 4.79 | 18.8 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 133.982 | 4.79 | 18.8 | 82.80 | 19.985 | 41.0 | 0.816005 |
2126 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (grouping by location keeps one country's lags from bleeding into the other's rows)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows makes "no history yet" indistinguishable
# from a true 0.0 mortality rate; dropping those rows may be safer.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] takes every column after 'location' and 'date', which includes
# 'Mortality Rate' (the target) and its three lag columns, and the fit runs on ALL rows
# before the train/test split in the next cell — both are forms of leakage; confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project columns 2 onward onto the first 6 principal components.
# NOTE(review): `pca` was fit in the previous cell on the FULL pre-split dataframe,
# including 'Mortality Rate' and its lag columns — target/test leakage into the components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these labels reuse the ORIGINAL feature names, but each column is a
# principal component (a linear mix of all inputs), so the downstream "feature
# importances" table names PCs, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — only 'Mortality Rate' is read from
# df_updated below — so this encoding step appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: the six principal components; y: the daily COVID-19 mortality rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# (random 70/30 split of time-series rows; fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
# (same scaler, fitted on the training set only — no scaling leakage here)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 parameter combinations in total
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 candidates x 10 folds = 3240 fits; n_jobs=-1 parallelizes across all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
# best_score_ is the mean cross-validated score of the best candidate
# (the regressor's default scorer — R^2 — since no `scoring` argument is given)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9588473335928803
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): GridSearchCV refits best_estimator_ on the whole training set by default
# (refit=True), so this explicit fit() is redundant — harmless but wasted work.
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into probability
# distributions and returns their KL divergence; y_test/y_pred are regression targets,
# so this value is not a meaningful "entropy" of the model's error — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.47227318644134914 R2 Score: 0.9826662059026536 RMSE: 0.687221 Entropy Value: 0.0030797153679582856
# Rank the six model inputs by their learned XGBoost importance scores, largest first.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.414115 |
| 5 | median_age | 0.352922 |
| 2 | female_smokers | 0.062994 |
| 1 | diabetes_prevalence | 0.060655 |
| 3 | life_expectancy | 0.059176 |
| 4 | aged_65_older | 0.050139 |
# Importing the dataframe of all 26 countries
# Re-load the full cleaned dataset from a local CSV so the next analysis starts from the
# complete frame (df_updated was filtered/overwritten by the previous cells).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis
country1 = 'Sweden'
country2 = 'United Kingdom'
# Extracting important features for XGBoost Model Analysis for the country health index:
# keep the identifiers, the six country-health features, and the mortality-rate target,
# restricted to the two countries of interest.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.816005 |
2126 rows × 9 columns
'''
Create lagged variables for the previous day's, previous week's, and previous
month's mortality rate using pandas shift(), converting the Our World in Data
COVID-19 time series into a tabular, supervised-learning problem so the XGBoost
model can assess which variables best predict COVID-19 mortality per country.
'''
# Create lagged mortality features per country (groupby prevents one country's
# history from shifting into another's rows).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; fill those NaNs with 0.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) to address multi-collinearity.
# FIX: the original code fit PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate'. Components derived from the target leak the
# answer into the model inputs and inflate the apparent test accuracy.
# Fit and transform on predictor columns only.
feature_cols = ['hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density',
                'population', 'prev_day_mortality', 'prev_week_mortality',
                'prev_month_mortality']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# Resulting component matrix as a DataFrame.
# NOTE(review): these columns are principal components merely labelled with the
# original feature names for downstream convenience; each is a linear
# combination of all predictors, not the named feature itself.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never added to X below
# (X is built from principal_df only), so this encoding does not affect the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Model inputs: the six retained principal components (column names are labels only).
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series mixes past and future rows
# between train and test; a chronological split would be more realistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# (fit on the training partition only, so test-set statistics never leak into scaling)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3 * 3 * 3 * 3 * 2 * 2 = 324 hyperparameter combinations.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 324 x 10 fits across all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9546833804604962
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set by
# default (refit=True), so this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes its inputs into
# probability distributions and computes a KL divergence; applying it to raw
# regression targets/predictions is not a standard regression metric —
# interpret this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.997926932618917 R2 Score: 0.9633731906218204 RMSE: 0.998963 Entropy Value: 0.005106077515730719
# Rank the model inputs by XGBoost's learned feature importance.
# NOTE(review): the inputs are principal components that were only labelled
# with the original feature names, so this table ranks components, not the
# raw features themselves.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.576604 |
| 2 | extreme_poverty | 0.127571 |
| 5 | population | 0.122504 |
| 0 | hospital_beds_per_thousand | 0.113870 |
| 4 | population_density | 0.046708 |
| 3 | gdp_per_capita | 0.012743 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute local path — not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The two countries compared in this run of the population-health-index analysis.
country1 = 'United States'
country2 = 'Austria'
# Extracting important features for XGBoost Model Analysis for the population health index
# Keep only the identifiers (location, date), the population-health predictors, and the target.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict the rows to the two countries under comparison.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 78.86 | 15.413 | 38.3 | 1.084791 |
2112 rows × 9 columns
'''
Create lagged variables for the previous day's, previous week's, and previous
month's mortality rate using pandas shift(), converting the Our World in Data
COVID-19 time series into a tabular, supervised-learning problem so the XGBoost
model can assess which variables best predict COVID-19 mortality per country.
'''
# Create lagged mortality features per country (groupby prevents one country's
# history from shifting into another's rows).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; fill those NaNs with 0.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) to address multi-collinearity.
# FIX: the original code fit PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate'. Components derived from the target leak the
# answer into the model inputs and inflate the apparent test accuracy.
# Fit and transform on predictor columns only.
feature_cols = ['cardiovasc_death_rate', 'diabetes_prevalence',
                'female_smokers', 'life_expectancy', 'aged_65_older',
                'median_age', 'prev_day_mortality', 'prev_week_mortality',
                'prev_month_mortality']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# Resulting component matrix as a DataFrame.
# NOTE(review): these columns are principal components merely labelled with the
# original feature names for downstream convenience; each is a linear
# combination of all predictors, not the named feature itself.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never added to X below
# (X is built from principal_df only), so this encoding does not affect the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Model inputs: the six retained principal components (column names are labels only).
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series mixes past and future rows
# between train and test; a chronological split would be more realistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# (fit on the training partition only, so test-set statistics never leak into scaling)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3 * 3 * 3 * 3 * 2 * 2 = 324 hyperparameter combinations.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 324 x 10 fits across all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.983794331453694
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set by
# default (refit=True), so this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes its inputs into
# probability distributions and computes a KL divergence; applying it to raw
# regression targets/predictions is not a standard regression metric —
# interpret this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00854869651988359 R2 Score: 0.9945310004412267 RMSE: 0.092459 Entropy Value: 0.0006678574833760092
# Rank the model inputs by XGBoost's learned feature importance.
# NOTE(review): the inputs are principal components that were only labelled
# with the original feature names, so this table ranks components, not the
# raw features themselves.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.631029 |
| 5 | median_age | 0.184170 |
| 0 | cardiovasc_death_rate | 0.107669 |
| 2 | female_smokers | 0.031789 |
| 4 | aged_65_older | 0.031600 |
| 3 | life_expectancy | 0.013742 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute local path — not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The two countries compared in this run of the country-health-index analysis.
country1 = 'United States'
country2 = 'Austria'
# Extracting important features for XGBoost Model Analysis for the country health index
# Keep only the identifiers (location, date), the country-health predictors, and the target.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict the rows to the two countries under comparison.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2112 rows × 9 columns
'''
Create lagged variables for the previous day's, previous week's, and previous
month's mortality rate using pandas shift(), converting the Our World in Data
COVID-19 time series into a tabular, supervised-learning problem so the XGBoost
model can assess which variables best predict COVID-19 mortality per country.
'''
# Create lagged mortality features per country (groupby prevents one country's
# history from shifting into another's rows).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; fill those NaNs with 0.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) to address multi-collinearity.
# FIX: the original code fit PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate'. Components derived from the target leak the
# answer into the model inputs and inflate the apparent test accuracy.
# Fit and transform on predictor columns only.
feature_cols = ['hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density',
                'population', 'prev_day_mortality', 'prev_week_mortality',
                'prev_month_mortality']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# Resulting component matrix as a DataFrame.
# NOTE(review): these columns are principal components merely labelled with the
# original feature names for downstream convenience; each is a linear
# combination of all predictors, not the named feature itself.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never added to X below
# (X is built from principal_df only), so this encoding does not affect the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Model inputs: the six retained principal components (column names are labels only).
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series mixes past and future rows
# between train and test; a chronological split would be more realistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# (fit on the training partition only, so test-set statistics never leak into scaling)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3 * 3 * 3 * 3 * 2 * 2 = 324 hyperparameter combinations.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 324 x 10 fits across all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9887577648734872
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set by
# default (refit=True), so this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes its inputs into
# probability distributions and computes a KL divergence; applying it to raw
# regression targets/predictions is not a standard regression metric —
# interpret this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00406836830426948 R2 Score: 0.9973972751975432 RMSE: 0.063784 Entropy Value: 0.0005141467774508717
# Rank the model inputs by XGBoost's learned feature importance.
# NOTE(review): the inputs are principal components that were only labelled
# with the original feature names, so this table ranks components, not the
# raw features themselves.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.670685 |
| 5 | population | 0.231413 |
| 4 | population_density | 0.042005 |
| 2 | extreme_poverty | 0.032527 |
| 3 | gdp_per_capita | 0.019548 |
| 0 | hospital_beds_per_thousand | 0.003822 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute local path — not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The two countries compared in this run of the population-health-index analysis.
country1 = 'Belgium'
country2 = 'Czechia'
# Extracting important features for XGBoost Model Analysis for the population health index
# Keep only the identifiers (location, date), the population-health predictors, and the target.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict the rows to the two countries under comparison.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 114.898 | 4.29 | 25.1 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 114.898 | 4.29 | 25.1 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 114.898 | 4.29 | 25.1 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 114.898 | 4.29 | 25.1 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 114.898 | 4.29 | 25.1 | 81.63 | 18.571 | 41.8 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 227.485 | 6.82 | 30.5 | 79.38 | 19.027 | 43.3 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 227.485 | 6.82 | 30.5 | 79.38 | 19.027 | 43.3 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 227.485 | 6.82 | 30.5 | 79.38 | 19.027 | 43.3 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 227.485 | 6.82 | 30.5 | 79.38 | 19.027 | 43.3 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 227.485 | 6.82 | 30.5 | 79.38 | 19.027 | 43.3 | 0.919575 |
2094 rows × 9 columns
'''
Create lagged variables for the previous day's, previous week's, and previous
month's mortality rate using pandas shift(), converting the Our World in Data
COVID-19 time series into a tabular, supervised-learning problem so the XGBoost
model can assess which variables best predict COVID-19 mortality per country.
'''
# Create lagged mortality features per country (groupby prevents one country's
# history from shifting into another's rows).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; fill those NaNs with 0.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) to address multi-collinearity.
# FIX: the original code fit PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate'. Components derived from the target leak the
# answer into the model inputs and inflate the apparent test accuracy.
# Fit and transform on predictor columns only.
feature_cols = ['cardiovasc_death_rate', 'diabetes_prevalence',
                'female_smokers', 'life_expectancy', 'aged_65_older',
                'median_age', 'prev_day_mortality', 'prev_week_mortality',
                'prev_month_mortality']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# Resulting component matrix as a DataFrame.
# NOTE(review): these columns are principal components merely labelled with the
# original feature names for downstream convenience; each is a linear
# combination of all predictors, not the named feature itself.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never added to X below
# (X is built from principal_df only), so this encoding does not affect the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Model inputs: the six retained principal components (column names are labels only).
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series mixes past and future rows
# between train and test; a chronological split would be more realistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# (fit on the training partition only, so test-set statistics never leak into scaling)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3 * 3 * 3 * 3 * 2 * 2 = 324 hyperparameter combinations.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 324 x 10 fits across all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9987378590313624
# GridSearchCV refits the best estimator on the whole training set by default
# (refit=True), so best_estimator_ is already trained — the explicit re-fit is
# unnecessary.
best_model = grid_search.best_estimator_
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XGBoost model: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as probability
# distributions (it normalises them and computes a KL divergence), which is an
# unusual metric for regression outputs — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.012446433855461606 R2 Score: 0.9988990709156199 RMSE: 0.111564 Entropy Value: 0.0005707690311533189
# Tabulate the tuned model's feature importances, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.740967 |
| 0 | cardiovasc_death_rate | 0.208080 |
| 5 | median_age | 0.027986 |
| 2 | female_smokers | 0.021149 |
| 3 | life_expectancy | 0.001681 |
| 4 | aged_65_older | 0.000137 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — this only runs on the
# author's machine; consider a relative path or configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this section.
country1 = 'Belgium'
country2 = 'Czechia'
# Restrict to the two countries of interest and keep only the socio-economic
# features used by the XGBoost country-health-index analysis (filter-then-select
# yields the same frame as select-then-filter).
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919575 |
2094 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day / 7 days / 30 days), computed within each
# location group so shifted values never cross country boundaries. Rows with no
# history yet (start of each country's series) get 0 instead of NaN.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the full dataset (all rows, columns 2 onward)
# before the later train/test split, so test rows influence the components —
# confirm this leakage is acceptable.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — the number of input variables used by
# the XGBoost model analysis for the country health index.
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# The retained columns are PCA component scores (linear combinations of ALL of
# the original features), so they are labelled PC1..PC6 instead of reusing raw
# feature names, which would misattribute the downstream feature importances to
# individual raw variables.
pc_labels = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies().
# NOTE(review): the dummy columns are never included in X below — confirm
# whether they were intended to be model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_labels
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training and testing sets for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only, so test-set statistics do not leak
# into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale the train and test sets with the scaler fitted above (training
# statistics only).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the grid search.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.998309889655373
# GridSearchCV refits the best estimator on the whole training set by default
# (refit=True), so best_estimator_ is already trained — the explicit re-fit is
# unnecessary.
best_model = grid_search.best_estimator_
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XGBoost model: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as probability
# distributions (it normalises them and computes a KL divergence), which is an
# unusual metric for regression outputs — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.014806362040415705 R2 Score: 0.9986903273023058 RMSE: 0.121681 Entropy Value: 0.0008238184433236955
# Tabulate the tuned model's feature importances, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.687524 |
| 2 | extreme_poverty | 0.166036 |
| 0 | hospital_beds_per_thousand | 0.087129 |
| 5 | population | 0.043040 |
| 3 | gdp_per_capita | 0.015953 |
| 4 | population_density | 0.000318 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — this only runs on the
# author's machine; consider a relative path or configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this section.
country1 = 'Estonia'
country2 = 'France'
# Restrict to the two countries of interest and keep only the demographic/health
# features used by the XGBoost population-health-index analysis (filter-then-select
# yields the same frame as select-then-filter).
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 255.569 | 4.02 | 24.5 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6250 | Estonia | 1/18/2020 | 255.569 | 4.02 | 24.5 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6251 | Estonia | 2/5/2020 | 255.569 | 4.02 | 24.5 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6252 | Estonia | 2/6/2020 | 255.569 | 4.02 | 24.5 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6253 | Estonia | 2/7/2020 | 255.569 | 4.02 | 24.5 | 78.74 | 19.452 | 42.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 86.060 | 4.77 | 30.1 | 82.66 | 19.718 | 42.0 | 0.411710 |
| 9443 | France | 12/26/2022 | 86.060 | 4.77 | 30.1 | 82.66 | 19.718 | 42.0 | 0.411282 |
| 9444 | France | 12/27/2022 | 86.060 | 4.77 | 30.1 | 82.66 | 19.718 | 42.0 | 0.411730 |
| 9445 | France | 12/28/2022 | 86.060 | 4.77 | 30.1 | 82.66 | 19.718 | 42.0 | 0.411813 |
| 9446 | France | 12/29/2022 | 86.060 | 4.77 | 30.1 | 82.66 | 19.718 | 42.0 | 0.411892 |
2132 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day / 7 days / 30 days), computed within each
# location group so shifted values never cross country boundaries. Rows with no
# history yet (start of each country's series) get 0 instead of NaN.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the full dataset (all rows, columns 2 onward)
# before the later train/test split, so test rows influence the components —
# confirm this leakage is acceptable.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — the number of input variables used by
# the XGBoost model analysis for the population health index.
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# The retained columns are PCA component scores (linear combinations of ALL of
# the original features), so they are labelled PC1..PC6 instead of reusing raw
# feature names, which would misattribute the downstream feature importances to
# individual raw variables.
pc_labels = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies().
# NOTE(review): the dummy columns are never included in X below — confirm
# whether they were intended to be model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_labels
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training and testing sets for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only, so test-set statistics do not leak
# into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale the train and test sets with the scaler fitted above (training
# statistics only).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the grid search.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9972748488300354
# GridSearchCV refits the best estimator on the whole training set by default
# (refit=True), so best_estimator_ is already trained — the explicit re-fit is
# unnecessary.
best_model = grid_search.best_estimator_
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XGBoost model: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as probability
# distributions (it normalises them and computes a KL divergence), which is an
# unusual metric for regression outputs — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.06576710142677315 R2 Score: 0.9931527190173807 RMSE: 0.256451 Entropy Value: 0.003671594073428902
# Tabulate the tuned model's feature importances, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.682271 |
| 0 | cardiovasc_death_rate | 0.282642 |
| 5 | median_age | 0.012867 |
| 3 | life_expectancy | 0.010845 |
| 2 | female_smokers | 0.010472 |
| 4 | aged_65_older | 0.000904 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — this only runs on the
# author's machine; consider a relative path or configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this section.
country1 = 'Estonia'
country2 = 'France'
# Restrict to the two countries of interest and keep only the socio-economic
# features used by the XGBoost country-health-index analysis (filter-then-select
# yields the same frame as select-then-filter).
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6250 | Estonia | 1/18/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6251 | Estonia | 2/5/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6252 | Estonia | 2/6/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6253 | Estonia | 2/7/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411710 |
| 9443 | France | 12/26/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411282 |
| 9444 | France | 12/27/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411730 |
| 9445 | France | 12/28/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411813 |
| 9446 | France | 12/29/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411892 |
2132 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day / 7 days / 30 days), computed within each
# location group so shifted values never cross country boundaries. Rows with no
# history yet (start of each country's series) get 0 instead of NaN.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the full dataset (all rows, columns 2 onward)
# before the later train/test split, so test rows influence the components —
# confirm this leakage is acceptable.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — the number of input variables used by
# the XGBoost model analysis for the country health index.
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# The retained columns are PCA component scores (linear combinations of ALL of
# the original features), so they are labelled PC1..PC6 instead of reusing raw
# feature names, which would misattribute the downstream feature importances to
# individual raw variables.
pc_labels = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies().
# NOTE(review): the dummy columns are never included in X below — confirm
# whether they were intended to be model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_labels
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training and testing sets for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only, so test-set statistics do not leak
# into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale the train and test sets with the scaler fitted above (training
# statistics only).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the grid search.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9978195582971618
# GridSearchCV refits the best estimator on the whole training set by default
# (refit=True), so best_estimator_ is already trained — the explicit re-fit is
# unnecessary.
best_model = grid_search.best_estimator_
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XGBoost model: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as probability
# distributions (it normalises them and computes a KL divergence), which is an
# unusual metric for regression outputs — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.07504882607957856 R2 Score: 0.9921863608333906 RMSE: 0.273950 Entropy Value: 0.004313799679754843
# Tabulate the tuned model's feature importances, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.760584 |
| 5 | population | 0.117334 |
| 0 | hospital_beds_per_thousand | 0.096855 |
| 2 | extreme_poverty | 0.013718 |
| 3 | gdp_per_capita | 0.011203 |
| 4 | population_density | 0.000306 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — this only runs on the
# author's machine; consider a relative path or configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this section.
country1 = 'Italy'
country2 = 'Portugal'
# Restrict to the two countries of interest and keep only the demographic/health
# features used by the XGBoost population-health-index analysis (filter-then-select
# yields the same frame as select-then-filter).
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 127.842 | 9.85 | 16.3 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 127.842 | 9.85 | 16.3 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 127.842 | 9.85 | 16.3 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 127.842 | 9.85 | 16.3 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 127.842 | 9.85 | 16.3 | 82.05 | 21.502 | 46.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 19.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 19.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 19.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 19.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 19.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
2098 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (previous day / week / month), computed per country
# so a shift never crosses a location boundary; the leading NaNs produced by
# each shift window are treated as zero mortality.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (
        df_updated.groupby(['location'])['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and the lagged
# mortality columns, so the target leaks into the PCA input — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # number of retained principal components (matches the 6 input variables)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of
# ALL inputs), not the original features — reusing the original feature names
# makes the downstream feature-importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Model inputs: first six principal components; target: raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling is fitted after PCA, while PCA itself was fitted on
# unscaled data — confirm the intended preprocessing order.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor, tuned by exhaustive grid search below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9993040710802911
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits best_estimator_ on the full
# training set (refit=True by default), so this fit call repeats that work.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes relative entropy between
# probability distributions (it normalises its inputs); y_test/y_pred are
# mortality rates, not distributions — confirm this metric is meaningful here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.02093963561387409 R2 Score: 0.9981619530362098 RMSE: 0.144705 Entropy Value: 0.0012838103910311198
# Rank the six model inputs by their XGBoost importance scores, highest first.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': feature_importances}
).sort_values(by='importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.791038 |
| 0 | cardiovasc_death_rate | 0.174679 |
| 1 | diabetes_prevalence | 0.022490 |
| 2 | female_smokers | 0.011428 |
| 3 | life_expectancy | 0.000305 |
| 4 | aged_65_older | 0.000060 |
# Reload the full 26-country dataframe before the next per-country analysis,
# discarding the column/row filtering applied in the previous section.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the country-health-index analysis.
country1 = 'Italy'
country2 = 'Portugal'
# Keep only the country-level (socio-economic) predictors plus identifiers and
# the target, then restrict the frame to the two countries under comparison.
df_updated = df_updated[[
    'location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
    'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
    'Mortality Rate',
]]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
2098 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (previous day / week / month), computed per country
# so a shift never crosses a location boundary; the leading NaNs produced by
# each shift window are treated as zero mortality.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (
        df_updated.groupby(['location'])['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and the lagged
# mortality columns, so the target leaks into the PCA input — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # number of retained principal components (matches the 6 input variables)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of
# ALL inputs), not the original features — reusing the original feature names
# makes the downstream feature-importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Model inputs: first six principal components; target: raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling is fitted after PCA, while PCA itself was fitted on
# unscaled data — confirm the intended preprocessing order.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor, tuned by exhaustive grid search below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9992988336861244
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits best_estimator_ on the full
# training set (refit=True by default), so this fit call repeats that work.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes relative entropy between
# probability distributions (it normalises its inputs); y_test/y_pred are
# mortality rates, not distributions — confirm this metric is meaningful here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.013594354001948517 R2 Score: 0.9988067098416261 RMSE: 0.116595 Entropy Value: 0.0007082932658725094
# Rank the six model inputs by their XGBoost importance scores, highest first.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': feature_importances}
).sort_values(by='importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.526697 |
| 1 | human_development_index | 0.421271 |
| 2 | extreme_poverty | 0.034415 |
| 0 | hospital_beds_per_thousand | 0.010278 |
| 3 | gdp_per_capita | 0.007127 |
| 4 | population_density | 0.000212 |
# Reload the full 26-country dataframe before the next per-country analysis,
# discarding the column/row filtering applied in the previous section.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the population-health-index analysis.
country1 = 'Romania'
country2 = 'Serbia'
# Keep only the population-health predictors plus identifiers and the target,
# then restrict the frame to the two countries under comparison.
df_updated = df_updated[[
    'location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
    'Mortality Rate',
]]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 439.415 | 10.08 | 37.7 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 439.415 | 10.08 | 37.7 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 439.415 | 10.08 | 37.7 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 439.415 | 10.08 | 37.7 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 439.415 | 10.08 | 37.7 | 76.00 | 17.366 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 370.946 | 9.74 | 22.9 | 76.05 | 17.850 | 43.0 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 370.946 | 9.74 | 22.9 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 370.946 | 9.74 | 22.9 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 370.946 | 9.74 | 22.9 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 370.946 | 9.74 | 22.9 | 76.05 | 17.850 | 43.0 | 2.036403 |
2076 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (previous day / week / month), computed per country
# so a shift never crosses a location boundary; the leading NaNs produced by
# each shift window are treated as zero mortality.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (
        df_updated.groupby(['location'])['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and the lagged
# mortality columns, so the target leaks into the PCA input — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # number of retained principal components (matches the 6 input variables)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of
# ALL inputs), not the original features — reusing the original feature names
# makes the downstream feature-importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Model inputs: first six principal components; target: raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling is fitted after PCA, while PCA itself was fitted on
# unscaled data — confirm the intended preprocessing order.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor, tuned by exhaustive grid search below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9985721206840632
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits best_estimator_ on the full
# training set (refit=True by default), so this fit call repeats that work.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes relative entropy between
# probability distributions (it normalises its inputs); y_test/y_pred are
# mortality rates, not distributions — confirm this metric is meaningful here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.001614404165662264 R2 Score: 0.9990600803111381 RMSE: 0.040180 Entropy Value: 0.00041066443993060793
# Rank the six model inputs by their XGBoost importance scores, highest first.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': feature_importances}
).sort_values(by='importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.450615 |
| 5 | median_age | 0.372406 |
| 1 | diabetes_prevalence | 0.154296 |
| 2 | female_smokers | 0.013306 |
| 3 | life_expectancy | 0.008851 |
| 4 | aged_65_older | 0.000526 |
# Reload the full 26-country dataframe before the next per-country analysis,
# discarding the column/row filtering applied in the previous section.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the country-health-index analysis.
country1 = 'Romania'
country2 = 'Serbia'
# Keep only the country-level (socio-economic) predictors plus identifiers and
# the target, then restrict the frame to the two countries under comparison.
df_updated = df_updated[[
    'location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
    'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
    'Mortality Rate',
]]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
2076 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (previous day / week / month), computed per country
# so a shift never crosses a location boundary; the leading NaNs produced by
# each shift window are treated as zero mortality.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (
        df_updated.groupby(['location'])['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project the data (columns from index 2 onward) onto the first 6 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mixture of all inputs), not the
# named variable — downstream "feature importances" therefore describe PCs,
# not the named features. Confirm this labeling is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot columns are not used as model inputs below
# (X is taken from principal_df); get_dummies here only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so no test-set statistics leak into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the train-fitted scaler to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Tune an XGBoost regressor by exhaustive grid search with 10-fold cross-validation.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(), param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979474526774726
# Refit the tuned estimator on the full (scaled) training split, then score the
# held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.fit(X_train_scaled, y_train).predict(X_test_scaled)
# Error metrics: MSE, RMSE, R^2, plus scipy's entropy between actual and predicted.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes a KL divergence
# after normalizing both vectors into distributions — confirm this is the
# intended "entropy" metric here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002076185405706244 R2 Score: 0.9987912273877525 RMSE: 0.045565 Entropy Value: 0.00045853981129780964
# Rank the model's inputs by XGBoost importance, largest first.
# NOTE(review): X was built from principal components, so these "features" are
# PCs carrying original-variable names — interpret the ranking with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.663901 |
| 0 | hospital_beds_per_thousand | 0.186232 |
| 1 | human_development_index | 0.124847 |
| 2 | extreme_poverty | 0.013139 |
| 3 | gdp_per_capita | 0.011260 |
| 4 | population_density | 0.000621 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — runnable only on this machine;
# consider a relative or configurable path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Slovakia'
country2 = 'Spain'
# Restrict the frame to the population-health predictors plus the target, for
# the two countries under comparison (row filter and column projection are
# combined into a single .loc).
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 23.1 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 23.1 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 23.1 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 23.1 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 23.1 | 77.54 | 15.070 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 99.403 | 7.17 | 27.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 99.403 | 7.17 | 27.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 99.403 | 7.17 | 27.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 99.403 | 7.17 | 27.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 99.403 | 7.17 | 27.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
2092 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (previous day / week / month);
# rows with no history get 0 instead of NaN.
mortality_series = df_updated.groupby(['location'])['Mortality Rate']
for lag_column, lag_days in (('prev_day_mortality', 1),
                             ('prev_week_mortality', 7),
                             ('prev_month_mortality', 30)):
    df_updated[lag_column] = mortality_series.shift(lag_days).fillna(0)
# Fit PCA on every column from the third onward to address multi-collinearity.
# NOTE(review): this slice also includes 'Mortality Rate' and its lag columns,
# i.e. the target leaks into the fitted components — confirm this is intended.
pca = PCA().fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project the data (columns from index 2 onward) onto the first 6 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mixture of all inputs), not the
# named variable — downstream "feature importances" therefore describe PCs,
# not the named features. Confirm this labeling is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot columns are not used as model inputs below
# (X is taken from principal_df); get_dummies here only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so no test-set statistics leak into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the train-fitted scaler to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Tune an XGBoost regressor by exhaustive grid search with 10-fold cross-validation.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(), param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9994767987719648
# Refit the tuned estimator on the full (scaled) training split, then score the
# held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.fit(X_train_scaled, y_train).predict(X_test_scaled)
# Error metrics: MSE, RMSE, R^2, plus scipy's entropy between actual and predicted.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes a KL divergence
# after normalizing both vectors into distributions — confirm this is the
# intended "entropy" metric here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0015014430402105537 R2 Score: 0.9997248669986107 RMSE: 0.038748 Entropy Value: 0.00014000658074544682
# Rank the model's inputs by XGBoost importance, largest first.
# NOTE(review): X was built from principal components, so these "features" are
# PCs carrying original-variable names — interpret the ranking with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.886071 |
| 1 | diabetes_prevalence | 0.062596 |
| 0 | cardiovasc_death_rate | 0.043915 |
| 2 | female_smokers | 0.006730 |
| 3 | life_expectancy | 0.000657 |
| 4 | aged_65_older | 0.000031 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — runnable only on this machine;
# consider a relative or configurable path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Slovakia'
country2 = 'Spain'
# Restrict the frame to the country-health predictors plus the target, for the
# two countries under comparison (row filter and column projection are combined
# into a single .loc).
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
2092 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (previous day / week / month);
# rows with no history get 0 instead of NaN.
mortality_series = df_updated.groupby(['location'])['Mortality Rate']
for lag_column, lag_days in (('prev_day_mortality', 1),
                             ('prev_week_mortality', 7),
                             ('prev_month_mortality', 30)):
    df_updated[lag_column] = mortality_series.shift(lag_days).fillna(0)
# Fit PCA on every column from the third onward to address multi-collinearity.
# NOTE(review): this slice also includes 'Mortality Rate' and its lag columns,
# i.e. the target leaks into the fitted components — confirm this is intended.
pca = PCA().fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project the data (columns from index 2 onward) onto the first 6 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mixture of all inputs), not the
# named variable — downstream "feature importances" therefore describe PCs,
# not the named features. Confirm this labeling is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot columns are not used as model inputs below
# (X is taken from principal_df); get_dummies here only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so no test-set statistics leak into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the train-fitted scaler to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Tune an XGBoost regressor by exhaustive grid search with 10-fold cross-validation.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(), param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9988021206356716
# Refit the tuned estimator on the full (scaled) training split, then score the
# held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.fit(X_train_scaled, y_train).predict(X_test_scaled)
# Error metrics: MSE, RMSE, R^2, plus scipy's entropy between actual and predicted.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes a KL divergence
# after normalizing both vectors into distributions — confirm this is the
# intended "entropy" metric here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.006694095157429868 R2 Score: 0.99877333575572 RMSE: 0.081817 Entropy Value: 0.00048549470785567154
# Rank the model's inputs by XGBoost importance, largest first.
# NOTE(review): X was built from principal components, so these "features" are
# PCs carrying original-variable names — interpret the ranking with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.766794 |
| 5 | population | 0.138965 |
| 2 | extreme_poverty | 0.046793 |
| 0 | hospital_beds_per_thousand | 0.046793 |
| 3 | gdp_per_capita | 0.000507 |
| 4 | population_density | 0.000149 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — runnable only on this machine;
# consider a relative or configurable path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Switzerland'
country2 = 'Bulgaria'
# Restrict the frame to the population-health predictors plus the target, for
# the two countries under comparison (row filter and column projection are
# combined into a single .loc).
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 30.1 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 30.1 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 30.1 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 30.1 | 75.05 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 30.1 | 75.05 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 99.739 | 5.59 | 22.6 | 83.78 | 18.436 | 43.1 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 99.739 | 5.59 | 22.6 | 83.78 | 18.436 | 43.1 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 99.739 | 5.59 | 22.6 | 83.78 | 18.436 | 43.1 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 99.739 | 5.59 | 22.6 | 83.78 | 18.436 | 43.1 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 99.739 | 5.59 | 22.6 | 83.78 | 18.436 | 43.1 | 0.322149 |
2066 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (previous day / week / month);
# rows with no history get 0 instead of NaN.
mortality_series = df_updated.groupby(['location'])['Mortality Rate']
for lag_column, lag_days in (('prev_day_mortality', 1),
                             ('prev_week_mortality', 7),
                             ('prev_month_mortality', 30)):
    df_updated[lag_column] = mortality_series.shift(lag_days).fillna(0)
# Fit PCA on every column from the third onward to address multi-collinearity.
# NOTE(review): this slice also includes 'Mortality Rate' and its lag columns,
# i.e. the target leaks into the fitted components — confirm this is intended.
pca = PCA().fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project the data (columns from index 2 onward) onto the first 6 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mixture of all inputs), not the
# named variable — downstream "feature importances" therefore describe PCs,
# not the named features. Confirm this labeling is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot columns are not used as model inputs below
# (X is taken from principal_df); get_dummies here only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so no test-set statistics leak into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the train-fitted scaler to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Tune an XGBoost regressor by exhaustive grid search with 10-fold cross-validation.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(), param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9661827427503995
# Refit the tuned estimator on the full (scaled) training split, then score the
# held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.fit(X_train_scaled, y_train).predict(X_test_scaled)
# Error metrics: MSE, RMSE, R^2, plus scipy's entropy between actual and predicted.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes a KL divergence
# after normalizing both vectors into distributions — confirm this is the
# intended "entropy" metric here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00378049910950887 R2 Score: 0.9984788009596705 RMSE: 0.061486 Entropy Value: 0.00038857518870859817
# Rank the model's inputs by XGBoost importance, largest first.
# NOTE(review): X was built from principal components, so these "features" are
# PCs carrying original-variable names — interpret the ranking with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.327840 |
| 0 | cardiovasc_death_rate | 0.307169 |
| 1 | diabetes_prevalence | 0.301346 |
| 2 | female_smokers | 0.035199 |
| 4 | aged_65_older | 0.017073 |
| 3 | life_expectancy | 0.011373 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — runnable only on this machine;
# consider a relative or configurable path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison in this run of the analysis
country1 = 'Switzerland'
country2 = 'Bulgaria'
# Extracting important features for XGBoost Model Analysis for the country health index
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
# Restrict to the two countries and the selected columns in one .loc call
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 4.530 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 4.530 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 4.530 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 4.530 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 4.530 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322149 |
2066 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, previous week, and previous
# month mortality rates; shifting within each location keeps one country's
# history from bleeding into another's.
lag_periods = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, periods in lag_periods.items():
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(periods)
# The first rows of each country have no earlier observation to shift from;
# fill those NaNs with 0 for all three lag columns in one vectorised call.
lag_cols = list(lag_periods)
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every numeric column, which here includes
# 'Mortality Rate' (the prediction target) and its lag columns, and the PCA
# is fit on the full dataset before the train/test split and before scaling.
# This leaks target/test information into the components and lets
# large-magnitude columns (e.g. population) dominate the fit -- revisit.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project all numeric columns onto the PCA basis and keep the first 6 components.
# NOTE(review): pca was fit on iloc[:, 2:], which includes 'Mortality Rate' and
# its lag columns, and on the full dataset before the train/test split -- this
# leaks target/test information into the components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a mixture of all numeric inputs), not the
# feature it is named after -- the downstream "feature importances" therefore
# describe components, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never placed into X below (X is built
# solely from principal_df), so this encoding does not reach the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling statistics on the training set only; the test set is transformed
# with these same statistics in the next cell.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
# Apply the same scaling (train statistics) to the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define the hyperparameter grid to tune (3*3*3*3*2*2 = 324 candidates)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search with 10-fold cross-validation (k = 10).
# scoring='r2' is exactly what GridSearchCV uses by default for a regressor
# (the estimator's own score method); naming it makes the meaning of the
# printed "Best CV score" explicit.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and the matching cross-validated R^2
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9660124510295903
# grid_search was built with the default refit=True, so best_estimator_ has
# already been refit on the full training set with the best hyperparameters;
# the explicit best_model.fit(...) the notebook previously ran here was a
# redundant second fit of the same model and has been removed.
best_model = grid_search.best_estimator_
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XGBoost Model: MSE, RMSE, R^2 Score, and "Entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence; it is not a
# standard regression error metric, and zero entries in y_test can make it
# undefined -- interpret this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0036559826087191843 R2 Score: 0.9985289039688287 RMSE: 0.060465 Entropy Value: 0.00028880483478410657
# Rank the importances reported by the fitted model, highest first.
# NOTE(review): selected_cols are the labels given to the PCA components, so
# these importances describe components, not the raw features themselves.
importance_df = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
}).sort_values('importance', ascending=False)
importance_df
| feature | importance | |
|---|---|---|
| 0 | hospital_beds_per_thousand | 0.513202 |
| 1 | human_development_index | 0.253525 |
| 5 | population | 0.179491 |
| 2 | extreme_poverty | 0.034966 |
| 3 | gdp_per_capita | 0.010116 |
| 4 | population_density | 0.008700 |
# Importing the dataframe of all 26 countries
# (re-read from disk so this analysis starts from the full, unfiltered frame;
# the bare expression on the last line displays it in the notebook)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison in this run of the analysis
country1 = 'Cyprus'
country2 = 'Latvia'
# Extracting important features for XGBoost Model Analysis for the population health index
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
# Restrict to the two countries and the selected columns in one .loc call
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 19.6 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 19.6 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 19.6 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 19.6 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 19.6 | 80.98 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.060 | 4.91 | 25.6 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.060 | 4.91 | 25.6 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.060 | 4.91 | 25.6 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.060 | 4.91 | 25.6 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.060 | 4.91 | 25.6 | 75.29 | 19.754 | 43.9 | 0.631969 |
2065 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, previous week, and previous
# month mortality rates; shifting within each location keeps one country's
# history from bleeding into another's.
lag_periods = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, periods in lag_periods.items():
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(periods)
# The first rows of each country have no earlier observation to shift from;
# fill those NaNs with 0 for all three lag columns in one vectorised call.
lag_cols = list(lag_periods)
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every numeric column, which here includes
# 'Mortality Rate' (the prediction target) and its lag columns, and the PCA
# is fit on the full dataset before the train/test split and before scaling.
# This leaks target/test information into the components and lets
# large-magnitude columns dominate the fit -- revisit.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project all numeric columns onto the PCA basis and keep the first 6 components.
# NOTE(review): pca was fit on iloc[:, 2:], which includes 'Mortality Rate' and
# its lag columns, and on the full dataset before the train/test split -- this
# leaks target/test information into the components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a mixture of all numeric inputs), not the
# feature it is named after -- the downstream "feature importances" therefore
# describe components, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never placed into X below (X is built
# solely from principal_df), so this encoding does not reach the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling statistics on the training set only; the test set is transformed
# with these same statistics in the next cell.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
# Apply the same scaling (train statistics) to the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define the hyperparameter grid to tune (3*3*3*3*2*2 = 324 candidates)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search with 10-fold cross-validation (k = 10).
# scoring='r2' is exactly what GridSearchCV uses by default for a regressor
# (the estimator's own score method); naming it makes the meaning of the
# printed "Best CV score" explicit.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and the matching cross-validated R^2
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9968381533668923
# grid_search was built with the default refit=True, so best_estimator_ has
# already been refit on the full training set with the best hyperparameters;
# the explicit best_model.fit(...) the notebook previously ran here was a
# redundant second fit of the same model and has been removed.
best_model = grid_search.best_estimator_
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XGBoost Model: MSE, RMSE, R^2 Score, and "Entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence; it is not a
# standard regression error metric, and zero entries in y_test can make it
# undefined -- interpret this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0007853700061385886 R2 Score: 0.9984435932019353 RMSE: 0.028024 Entropy Value: 0.00041728333907532875
# Rank the importances reported by the fitted model, highest first.
# NOTE(review): selected_cols are the labels given to the PCA components, so
# these importances describe components, not the raw features themselves.
importance_df = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
}).sort_values('importance', ascending=False)
importance_df
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.495285 |
| 5 | median_age | 0.278346 |
| 0 | cardiovasc_death_rate | 0.187393 |
| 2 | female_smokers | 0.032736 |
| 3 | life_expectancy | 0.004431 |
| 4 | aged_65_older | 0.001808 |
# Importing the dataframe of all 26 countries
# (re-read from disk so this analysis starts from the full, unfiltered frame;
# the bare expression on the last line displays it in the notebook)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison in this run of the analysis
country1 = 'Cyprus'
country2 = 'Latvia'
# Extracting important features for XGBoost Model Analysis for the country health index
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
# Restrict to the two countries and the selected columns in one .loc call
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631969 |
2065 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, previous week, and previous
# month mortality rates; shifting within each location keeps one country's
# history from bleeding into another's.
lag_periods = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, periods in lag_periods.items():
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(periods)
# The first rows of each country have no earlier observation to shift from;
# fill those NaNs with 0 for all three lag columns in one vectorised call.
lag_cols = list(lag_periods)
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every numeric column, which here includes
# 'Mortality Rate' (the prediction target) and its lag columns, and the PCA
# is fit on the full dataset before the train/test split and before scaling.
# This leaks target/test information into the components and lets
# large-magnitude columns (e.g. population) dominate the fit -- revisit.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project all numeric columns onto the PCA basis and keep the first 6 components.
# NOTE(review): pca was fit on iloc[:, 2:], which includes 'Mortality Rate' and
# its lag columns, and on the full dataset before the train/test split -- this
# leaks target/test information into the components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a mixture of all numeric inputs), not the
# feature it is named after -- the downstream "feature importances" therefore
# describe components, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never placed into X below (X is built
# solely from principal_df), so this encoding does not reach the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling statistics on the training set only; the test set is transformed
# with these same statistics in the next cell.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
# Apply the same scaling (train statistics) to the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define the hyperparameter grid to tune (3*3*3*3*2*2 = 324 candidates)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search with 10-fold cross-validation (k = 10).
# scoring='r2' is exactly what GridSearchCV uses by default for a regressor
# (the estimator's own score method); naming it makes the meaning of the
# printed "Best CV score" explicit.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and the matching cross-validated R^2
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9965848123547593
# grid_search was built with the default refit=True, so best_estimator_ has
# already been refit on the full training set with the best hyperparameters;
# the explicit best_model.fit(...) the notebook previously ran here was a
# redundant second fit of the same model and has been removed.
best_model = grid_search.best_estimator_
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XGBoost Model: MSE, RMSE, R^2 Score, and "Entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence; it is not a
# standard regression error metric, and zero entries in y_test can make it
# undefined -- interpret this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0010839291039479871 R2 Score: 0.9978519237902916 RMSE: 0.032923 Entropy Value: 0.000511041568557603
# Rank the importances reported by the fitted model, highest first.
# NOTE(review): selected_cols are the labels given to the PCA components, so
# these importances describe components, not the raw features themselves.
importance_df = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
}).sort_values('importance', ascending=False)
importance_df
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.394633 |
| 0 | hospital_beds_per_thousand | 0.378747 |
| 5 | population | 0.194840 |
| 2 | extreme_poverty | 0.024974 |
| 3 | gdp_per_capita | 0.005458 |
| 4 | population_density | 0.001347 |
# Country Pair by Pair Analysis relative to female smokers
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# (the bare expression on the last line displays the frame in the notebook)
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Showing the pairings of countries based on female smokers (13 pairs of countries)
# Build one dataframe per country by filtering the cleaned OWID frame `df`
# on its 'location' column (26 countries in total).
df_Canada = df[(df.location == "Canada")]
df_Cyprus = df[(df.location == "Cyprus")]
df_Denmark = df[(df.location == "Denmark")]
df_Finland = df[(df.location == "Finland")]
df_Iceland = df[(df.location == "Iceland")]
df_Italy = df[(df.location == "Italy")]
df_Portugal = df[(df.location == "Portugal")]
df_Slovenia = df[(df.location == "Slovenia")]
df_Sweden = df[(df.location == "Sweden")]
df_UnitedKingdom = df[(df.location == "United Kingdom")]
df_UnitedStates = df[(df.location == "United States")]
df_Austria = df[(df.location == "Austria")]
df_Belgium = df[(df.location == "Belgium")]
df_Estonia = df[(df.location == "Estonia")]
df_Ireland = df[(df.location == "Ireland")]
df_Latvia = df[(df.location == "Latvia")]
df_Luxembourg = df[(df.location == "Luxembourg")]
df_Netherlands = df[(df.location == "Netherlands")]
df_Romania = df[(df.location == "Romania")]
df_Slovakia = df[(df.location == "Slovakia")]
df_Spain = df[(df.location == "Spain")]
df_Switzerland = df[(df.location == "Switzerland")]
df_Bulgaria= df[(df.location == "Bulgaria")]
df_Czechia = df[(df.location == "Czechia")]
df_France = df[(df.location == "France")]
df_Serbia = df[(df.location == "Serbia")]
# NOTE(review): tail(-2) drops the first two United Kingdom rows; presumably
# this aligns its start date with the other countries -- confirm against the data.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)
# Exporting final_dataframe to CSV file
# NOTE(review): to_csv writes to the current working directory (relative path)
# and keeps the row index by default, while later cells read the file from the
# Downloads folder via an absolute path -- confirm both refer to the same file.
dataframe_one.to_csv("dataframe-one.csv")
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): this reads from the Downloads folder, while the file was just
# written with a relative path to the working directory -- assumes the two
# locations are the same; confirm.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Canada'
country2 = 'Cyprus'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two countries under comparison. .copy() materialises the
# filtered slice so the lagged-mortality columns assigned later write to an
# owned DataFrame rather than a view of the original (avoids pandas'
# SettingWithCopyWarning / a potentially silent no-op assignment).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 16.6 | 82.43 | 16.984 | 41.4 | 1.093162 |
2099 rows × 9 columns
# Turn the time series into a supervised-learning table: for each country,
# attach the mortality rate observed 1 day, 7 days, and 30 days earlier as
# extra feature columns (pandas groupby + shift). This tabular layout is
# what lets XGBoost use recent history as predictors of current mortality.
per_country_mortality = df_updated.groupby(['location'])['Mortality Rate']
lag_spec = {'prev_day_mortality': 1,
            'prev_week_mortality': 7,
            'prev_month_mortality': 30}
for lag_name, lag_days in lag_spec.items():
    # Rows at the start of each country's series have no history for this
    # lag; replace the resulting NaNs with 0.
    df_updated[lag_name] = per_country_mortality.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on the raw, unscaled numeric columns here,
# while StandardScaler is only applied after the PCA transform further
# below. PCA is scale-sensitive, so large-magnitude columns dominate the
# components — confirm this ordering is intended.
pca = PCA()
# Fit on every column from position 2 onward, i.e. everything except
# 'location' and 'date' — this also includes 'Mortality Rate' itself and
# the three lagged mortality columns created above.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project the same numeric columns used to fit the PCA and keep only the
# first 6 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but
# each column is a principal component — a linear mix of ALL numeric
# inputs — not the named feature itself; the feature-importance table built
# later therefore ranks components, not raw features.
# NOTE(review): the PCA inputs (df_updated.iloc[:,2:]) include
# 'Mortality Rate' and its lagged copies, so X carries target information
# (leakage) — likely the reason for the near-perfect R^2 reported below.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never included in X
# below, so this encoding has no effect on the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters are tuned by the grid search.
base_regressor = xgb.XGBRegressor()
# Hyperparameter search space.
search_space = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelised
# across all available cores.
grid_search = GridSearchCV(estimator=base_regressor, param_grid=search_space, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9989396252151874
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ is
# already fitted on the full training set, so this extra fit is redundant
# (though harmless).
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays into
# probability distributions and returns their KL divergence — it is not a
# regression error metric, and it returns inf whenever qk (y_pred) has a
# zero where pk (y_test) is non-zero. Interpret with care or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0023675274789805804 R2 Score: 0.9993039593180916 RMSE: 0.048657 Entropy Value: 0.0002917411838061955
# Tabulate the tuned model's importance scores against the (PCA-derived)
# feature columns, largest importance first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.459504 |
| 1 | diabetes_prevalence | 0.417500 |
| 5 | median_age | 0.090781 |
| 2 | male_smokers | 0.026672 |
| 3 | life_expectancy | 0.004896 |
| 4 | aged_65_older | 0.000647 |
# Importing the dataframe of all 26 countries
# Reload the full multi-country dataframe from disk so the country-health
# analysis below starts from the complete dataset again.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Canada'
country2 = 'Cyprus'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the two countries under comparison. .copy() materialises the
# filtered slice so the lagged-mortality columns assigned later write to an
# owned DataFrame rather than a view of the original (avoids pandas'
# SettingWithCopyWarning / a potentially silent no-op assignment).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.093162 |
2099 rows × 9 columns
# Turn the time series into a supervised-learning table: for each country,
# attach the mortality rate observed 1 day, 7 days, and 30 days earlier as
# extra feature columns (pandas groupby + shift). This tabular layout is
# what lets XGBoost use recent history as predictors of current mortality.
per_country_mortality = df_updated.groupby(['location'])['Mortality Rate']
lag_spec = {'prev_day_mortality': 1,
            'prev_week_mortality': 7,
            'prev_month_mortality': 30}
for lag_name, lag_days in lag_spec.items():
    # Rows at the start of each country's series have no history for this
    # lag; replace the resulting NaNs with 0.
    df_updated[lag_name] = per_country_mortality.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on the raw, unscaled numeric columns here,
# while StandardScaler is only applied after the PCA transform further
# below. PCA is scale-sensitive, so large-magnitude columns (e.g.
# 'population') dominate the components — confirm this ordering is intended.
pca = PCA()
# Fit on every column from position 2 onward, i.e. everything except
# 'location' and 'date' — this also includes 'Mortality Rate' itself and
# the three lagged mortality columns created above.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project the same numeric columns used to fit the PCA and keep only the
# first 6 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but
# each column is a principal component — a linear mix of ALL numeric
# inputs — not the named feature itself; the feature-importance table built
# later therefore ranks components, not raw features.
# NOTE(review): the PCA inputs (df_updated.iloc[:,2:]) include
# 'Mortality Rate' and its lagged copies, so X carries target information
# (leakage) — likely the reason for the near-perfect R^2 reported below.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never included in X
# below, so this encoding has no effect on the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters are tuned by the grid search.
base_regressor = xgb.XGBRegressor()
# Hyperparameter search space.
search_space = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelised
# across all available cores.
grid_search = GridSearchCV(estimator=base_regressor, param_grid=search_space, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989605271130995
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ is
# already fitted on the full training set, so this extra fit is redundant
# (though harmless).
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays into
# probability distributions and returns their KL divergence — it is not a
# regression error metric, and it returns inf whenever qk (y_pred) has a
# zero where pk (y_test) is non-zero. Interpret with care or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002685282233557505 R2 Score: 0.999210541084082 RMSE: 0.051820 Entropy Value: 0.00034527877599041973
# Tabulate the tuned model's importance scores against the (PCA-derived)
# feature columns, largest importance first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.655426 |
| 1 | human_development_index | 0.198181 |
| 0 | hospital_beds_per_thousand | 0.115957 |
| 2 | extreme_poverty | 0.020517 |
| 4 | population_density | 0.006514 |
| 3 | gdp_per_capita | 0.003405 |
# Importing the dataframe of all 26 countries
# Reload the full multi-country dataframe from disk so the next country
# pair's analysis starts from the complete dataset again.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Denmark'
country2 = 'Finland'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two countries under comparison. .copy() materialises the
# filtered slice so the lagged-mortality columns assigned later write to an
# owned DataFrame rather than a view of the original (avoids pandas'
# SettingWithCopyWarning / a potentially silent no-op assignment).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 114.767 | 6.41 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5188 | Denmark | 2/3/2020 | 114.767 | 6.41 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5189 | Denmark | 2/4/2020 | 114.767 | 6.41 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5190 | Denmark | 2/5/2020 | 114.767 | 6.41 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5191 | Denmark | 2/6/2020 | 114.767 | 6.41 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 153.507 | 5.76 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 153.507 | 5.76 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 153.507 | 5.76 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 153.507 | 5.76 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 153.507 | 5.76 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
2128 rows × 9 columns
# Turn the time series into a supervised-learning table: for each country,
# attach the mortality rate observed 1 day, 7 days, and 30 days earlier as
# extra feature columns (pandas groupby + shift). This tabular layout is
# what lets XGBoost use recent history as predictors of current mortality.
per_country_mortality = df_updated.groupby(['location'])['Mortality Rate']
lag_spec = {'prev_day_mortality': 1,
            'prev_week_mortality': 7,
            'prev_month_mortality': 30}
for lag_name, lag_days in lag_spec.items():
    # Rows at the start of each country's series have no history for this
    # lag; replace the resulting NaNs with 0.
    df_updated[lag_name] = per_country_mortality.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on the raw, unscaled numeric columns here,
# while StandardScaler is only applied after the PCA transform further
# below. PCA is scale-sensitive, so large-magnitude columns dominate the
# components — confirm this ordering is intended.
pca = PCA()
# Fit on every column from position 2 onward, i.e. everything except
# 'location' and 'date' — this also includes 'Mortality Rate' itself and
# the three lagged mortality columns created above.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project the same numeric columns used to fit the PCA and keep only the
# first 6 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but
# each column is a principal component — a linear mix of ALL numeric
# inputs — not the named feature itself; the feature-importance table built
# later therefore ranks components, not raw features.
# NOTE(review): the PCA inputs (df_updated.iloc[:,2:]) include
# 'Mortality Rate' and its lagged copies, so X carries target information
# (leakage) — likely the reason for the near-perfect R^2 reported below.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never included in X
# below, so this encoding has no effect on the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters are tuned by the grid search.
base_regressor = xgb.XGBRegressor()
# Hyperparameter search space.
search_space = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelised
# across all available cores.
grid_search = GridSearchCV(estimator=base_regressor, param_grid=search_space, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9987535557486945
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ is
# already fitted on the full training set, so this extra fit is redundant
# (though harmless).
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays into
# probability distributions and returns their KL divergence — it is not a
# regression error metric, and it returns inf whenever qk (y_pred) has a
# zero where pk (y_test) is non-zero (as the printed 'inf' below shows).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.007569049722669286 R2 Score: 0.9957062272826365 RMSE: 0.087000 Entropy Value: inf
# Tabulate the tuned model's importance scores against the (PCA-derived)
# feature columns, largest importance first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.807879 |
| 5 | median_age | 0.060286 |
| 2 | male_smokers | 0.056874 |
| 0 | cardiovasc_death_rate | 0.053250 |
| 3 | life_expectancy | 0.021230 |
| 4 | aged_65_older | 0.000481 |
# Importing the dataframe of all 26 countries
# Reload the full multi-country dataframe from disk so the country-health
# analysis below starts from the complete dataset again.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Denmark'
country2 = 'Finland'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the two countries under comparison. .copy() materialises the
# filtered slice so the lagged-mortality columns assigned later write to an
# owned DataFrame rather than a view of the original (avoids pandas'
# SettingWithCopyWarning / a potentially silent no-op assignment).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5188 | Denmark | 2/3/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5189 | Denmark | 2/4/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5190 | Denmark | 2/5/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5191 | Denmark | 2/6/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
2128 rows × 9 columns
# Turn the time series into a supervised-learning table: for each country,
# attach the mortality rate observed 1 day, 7 days, and 30 days earlier as
# extra feature columns (pandas groupby + shift). This tabular layout is
# what lets XGBoost use recent history as predictors of current mortality.
per_country_mortality = df_updated.groupby(['location'])['Mortality Rate']
lag_spec = {'prev_day_mortality': 1,
            'prev_week_mortality': 7,
            'prev_month_mortality': 30}
for lag_name, lag_days in lag_spec.items():
    # Rows at the start of each country's series have no history for this
    # lag; replace the resulting NaNs with 0.
    df_updated[lag_name] = per_country_mortality.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on the raw, unscaled numeric columns here,
# while StandardScaler is only applied after the PCA transform further
# below. PCA is scale-sensitive, so large-magnitude columns (e.g.
# 'population') dominate the components — confirm this ordering is intended.
pca = PCA()
# Fit on every column from position 2 onward, i.e. everything except
# 'location' and 'date' — this also includes 'Mortality Rate' itself and
# the three lagged mortality columns created above.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project the same numeric columns used to fit the PCA and keep only the
# first 6 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but
# each column is a principal component — a linear mix of ALL numeric
# inputs — not the named feature itself; the feature-importance table built
# later therefore ranks components, not raw features.
# NOTE(review): the PCA inputs (df_updated.iloc[:,2:]) include
# 'Mortality Rate' and its lagged copies, so X carries target information
# (leakage) — likely the reason for the near-perfect R^2 reported below.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never included in X
# below, so this encoding has no effect on the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both partitions.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base regressor; hyperparameters are selected by the grid search below.
xgb_model = xgb.XGBRegressor()
# Search space: tree depth, shrinkage, ensemble size, minimum split loss,
# and row/column subsampling fractions.
params = dict(max_depth=[3, 4, 5],
              learning_rate=[0.1, 0.01, 0.001],
              n_estimators=[50, 100, 150],
              gamma=[0, 0.1, 0.2],
              subsample=[0.8, 0.9],
              colsample_bytree=[0.8, 0.9])
# Exhaustive grid search with 10-fold cross-validation, using all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params,
                           cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9984953654596765
# Take the grid-search winner and refit it on the full training split.
# (GridSearchCV's default refit=True already did this; the explicit fit
# below simply repeats it.)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test rows.
y_pred = best_model.predict(X_test_scaled)
# Test-set metrics: MSE, R^2, RMSE, and relative entropy.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
# NOTE(review): scipy.stats.entropy normalizes its two inputs into
# probability distributions, so this value is a KL divergence between the
# normalized actual and predicted series — not a conventional regression
# metric; confirm this interpretation is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008136863131264145 R2 Score: 0.9953841179278682 RMSE: 0.090205 Entropy Value: 0.0015398143907589235
# Tabulate the tuned model's importance scores, largest first.
# NOTE(review): because the PCA scores were relabelled with raw feature
# names, these rows rank principal components, not the original variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
      .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.753936 |
| 5 | population | 0.092143 |
| 0 | hospital_beds_per_thousand | 0.070220 |
| 2 | extreme_poverty | 0.061815 |
| 3 | gdp_per_capita | 0.020882 |
| 4 | population_density | 0.001005 |
# Reload the cleaned 26-country OWID dataframe for the next analysis run.
# NOTE(review): hard-coded, user-specific absolute path — consider a
# relative path or a configurable constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison in this run.
country1 = 'Iceland'
country2 = 'Italy'
# Keep the population-health predictors plus identifiers and the target
# for the XGBoost Model Analysis for the population health index.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate',
                         'diabetes_prevalence', 'male_smokers',
                         'life_expectancy', 'aged_65_older', 'median_age',
                         'Mortality Rate']]
# Restrict to the two selected countries.  .copy() materializes the
# filtered frame so the lagged-column assignments that follow write to an
# independent dataframe rather than an ambiguous slice view (avoids
# pandas' SettingWithCopyWarning / silently-lost writes).
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 20911 | Iceland | 2/28/2020 | 117.992 | 5.31 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| 20912 | Iceland | 2/29/2020 | 117.992 | 5.31 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| 20913 | Iceland | 3/1/2020 | 117.992 | 5.31 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| 20914 | Iceland | 3/2/2020 | 117.992 | 5.31 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| 20915 | Iceland | 3/3/2020 | 117.992 | 5.31 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
2100 rows × 9 columns
'''
To apply the XGBoost Model to this time-series dataset, it must first be
recast as a supervised learning problem: a tabular format in which each row
is a single observation and each column is a feature. Lagged variables for
the previous day's, previous week's, and previous month's mortality rates
are created with pandas' shift() method. With the data in this form, the
XGBoost Model can be used directly to assess which variables are the
strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Recast the per-country mortality time series as a supervised-learning
# table by adding lagged copies of the target: the mortality rate from the
# previous day, week, and month.  Shifting within each 'location' group
# keeps one country's history from bleeding into another's; rows too early
# to have a lag are filled with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in [('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Fit PCA on every column from index 2 onward to address multi-collinearity.
# NOTE(review): that slice includes 'Mortality Rate' itself and its lags,
# so the fitted components embed target information (leakage) — likely
# inflating the downstream CV/test scores; confirm this is intended.
# NOTE(review): PCA is fit on unscaled data, so the columns with the
# largest raw variance dominate the leading components.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the six leading principal components — one per model input variable.
n_components = 6
pc_scores = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# Wrap the component scores in a dataframe.
# NOTE(review): these columns are principal components, not the original
# variables; reusing the raw feature names here makes the later
# feature-importance table read as if it ranked the raw inputs — consider
# labelling them 'PC1'..'PC6' instead.
principal_df = pd.DataFrame(
    data=pc_scores,
    columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers',
             'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country names; only 'Mortality Rate' is read from the
# encoded frame in this section.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs are the six PCA scores; the target is the mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence',
                 'male_smokers', 'life_expectancy', 'aged_65_older',
                 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Learn standardization statistics from the training split only,
# so no test-set information leaks into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both partitions.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base regressor; hyperparameters are selected by the grid search below.
xgb_model = xgb.XGBRegressor()
# Search space: tree depth, shrinkage, ensemble size, minimum split loss,
# and row/column subsampling fractions.
params = dict(max_depth=[3, 4, 5],
              learning_rate=[0.1, 0.01, 0.001],
              n_estimators=[50, 100, 150],
              gamma=[0, 0.1, 0.2],
              subsample=[0.8, 0.9],
              colsample_bytree=[0.8, 0.9])
# Exhaustive grid search with 10-fold cross-validation, using all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params,
                           cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9993163648274013
# Take the grid-search winner and refit it on the full training split.
# (GridSearchCV's default refit=True already did this; the explicit fit
# below simply repeats it.)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test rows.
y_pred = best_model.predict(X_test_scaled)
# Test-set metrics: MSE, R^2, RMSE, and relative entropy.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
# NOTE(review): scipy.stats.entropy normalizes its two inputs into
# probability distributions, so this value is a KL divergence between the
# normalized actual and predicted series — not a conventional regression
# metric; confirm this interpretation is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.015447420587881813 R2 Score: 0.9987332719640899 RMSE: 0.124288 Entropy Value: 0.0007928936109947494
# Tabulate the tuned model's importance scores, largest first.
# NOTE(review): because the PCA scores were relabelled with raw feature
# names, these rows rank principal components, not the original variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
      .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.604268 |
| 0 | cardiovasc_death_rate | 0.336193 |
| 5 | median_age | 0.046313 |
| 2 | male_smokers | 0.012867 |
| 3 | life_expectancy | 0.000293 |
| 4 | aged_65_older | 0.000066 |
# Reload the cleaned 26-country OWID dataframe for the next analysis run.
# NOTE(review): hard-coded, user-specific absolute path — consider a
# relative path or a configurable constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison in this run.
country1 = 'Iceland'
country2 = 'Italy'
# Keep the country-level predictors plus identifiers and the target
# for the XGBoost Model Analysis for the country health index.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand',
                         'human_development_index', 'extreme_poverty',
                         'gdp_per_capita', 'population_density',
                         'population', 'Mortality Rate']]
# Restrict to the two selected countries.  .copy() materializes the
# filtered frame so the lagged-column assignments that follow write to an
# independent dataframe rather than an ambiguous slice view (avoids
# pandas' SettingWithCopyWarning / silently-lost writes).
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 20911 | Iceland | 2/28/2020 | 2.91 | 0.949 | 0.2 | 46482.958 | 3.404 | 372903 | 0.000000 |
| 20912 | Iceland | 2/29/2020 | 2.91 | 0.949 | 0.2 | 46482.958 | 3.404 | 372903 | 0.000000 |
| 20913 | Iceland | 3/1/2020 | 2.91 | 0.949 | 0.2 | 46482.958 | 3.404 | 372903 | 0.000000 |
| 20914 | Iceland | 3/2/2020 | 2.91 | 0.949 | 0.2 | 46482.958 | 3.404 | 372903 | 0.000000 |
| 20915 | Iceland | 3/3/2020 | 2.91 | 0.949 | 0.2 | 46482.958 | 3.404 | 372903 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
2100 rows × 9 columns
'''
To apply the XGBoost Model to this time-series dataset, it must first be
recast as a supervised learning problem: a tabular format in which each row
is a single observation and each column is a feature. Lagged variables for
the previous day's, previous week's, and previous month's mortality rates
are created with pandas' shift() method. With the data in this form, the
XGBoost Model can be used directly to assess which variables are the
strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Recast the per-country mortality time series as a supervised-learning
# table by adding lagged copies of the target: the mortality rate from the
# previous day, week, and month.  Shifting within each 'location' group
# keeps one country's history from bleeding into another's; rows too early
# to have a lag are filled with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in [('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Fit PCA on every column from index 2 onward to address multi-collinearity.
# NOTE(review): that slice includes 'Mortality Rate' itself and its lags,
# so the fitted components embed target information (leakage) — likely
# inflating the downstream CV/test scores; confirm this is intended.
# NOTE(review): PCA is fit on unscaled data, so the columns with the
# largest raw variance dominate the leading components.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the six leading principal components — one per model input variable.
n_components = 6
pc_scores = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# Wrap the component scores in a dataframe.
# NOTE(review): these columns are principal components, not the original
# variables; reusing the raw feature names here makes the later
# feature-importance table read as if it ranked the raw inputs — consider
# labelling them 'PC1'..'PC6' instead.
principal_df = pd.DataFrame(
    data=pc_scores,
    columns=['hospital_beds_per_thousand', 'human_development_index',
             'extreme_poverty', 'gdp_per_capita', 'population_density',
             'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country names; only 'Mortality Rate' is read from the
# encoded frame in this section.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs are the six PCA scores; the target is the mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population_density',
                 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Learn standardization statistics from the training split only,
# so no test-set information leaks into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both partitions.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base regressor; hyperparameters are selected by the grid search below.
xgb_model = xgb.XGBRegressor()
# Search space: tree depth, shrinkage, ensemble size, minimum split loss,
# and row/column subsampling fractions.
params = dict(max_depth=[3, 4, 5],
              learning_rate=[0.1, 0.01, 0.001],
              n_estimators=[50, 100, 150],
              gamma=[0, 0.1, 0.2],
              subsample=[0.8, 0.9],
              colsample_bytree=[0.8, 0.9])
# Exhaustive grid search with 10-fold cross-validation, using all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params,
                           cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9994138916027948
# Take the grid-search winner and refit it on the full training split.
# (GridSearchCV's default refit=True already did this; the explicit fit
# below simply repeats it.)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test rows.
y_pred = best_model.predict(X_test_scaled)
# Test-set metrics: MSE, R^2, RMSE, and relative entropy.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
# NOTE(review): scipy.stats.entropy normalizes its two inputs into
# probability distributions, so this value is a KL divergence between the
# normalized actual and predicted series — not a conventional regression
# metric; confirm this interpretation is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.03290481728582324 R2 Score: 0.9973017207413157 RMSE: 0.181397 Entropy Value: 0.0017395706724272842
# Tabulate the tuned model's importance scores, largest first.
# NOTE(review): because the PCA scores were relabelled with raw feature
# names, these rows rank principal components, not the original variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
      .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.549859 |
| 1 | human_development_index | 0.349497 |
| 0 | hospital_beds_per_thousand | 0.042623 |
| 2 | extreme_poverty | 0.033844 |
| 3 | gdp_per_capita | 0.022440 |
| 4 | population_density | 0.001738 |
# Reload the cleaned 26-country OWID dataframe for the next analysis run.
# NOTE(review): hard-coded, user-specific absolute path — consider a
# relative path or a configurable constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison in this run.
country1 = 'Portugal'
country2 = 'Slovenia'
# Keep the population-health predictors plus identifiers and the target
# for the XGBoost Model Analysis for the population health index.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate',
                         'diabetes_prevalence', 'male_smokers',
                         'life_expectancy', 'aged_65_older', 'median_age',
                         'Mortality Rate']]
# Restrict to the two selected countries.  .copy() materializes the
# filtered frame so the lagged-column assignments that follow write to an
# independent dataframe rather than an ambiguous slice view (avoids
# pandas' SettingWithCopyWarning / silently-lost writes).
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 127.842 | 9.85 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 127.842 | 9.85 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 127.842 | 9.85 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 127.842 | 9.85 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 127.842 | 9.85 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 25.0 | 81.32 | 19.062 | 44.5 | 0.536669 |
2096 rows × 9 columns
'''
To apply the XGBoost Model to this time-series dataset, it must first be
recast as a supervised learning problem: a tabular format in which each row
is a single observation and each column is a feature. Lagged variables for
the previous day's, previous week's, and previous month's mortality rates
are created with pandas' shift() method. With the data in this form, the
XGBoost Model can be used directly to assess which variables are the
strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Recast the per-country mortality time series as a supervised-learning
# table by adding lagged copies of the target: the mortality rate from the
# previous day, week, and month.  Shifting within each 'location' group
# keeps one country's history from bleeding into another's; rows too early
# to have a lag are filled with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in [('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Fit PCA on every column from index 2 onward to address multi-collinearity.
# NOTE(review): that slice includes 'Mortality Rate' itself and its lags,
# so the fitted components embed target information (leakage) — likely
# inflating the downstream CV/test scores; confirm this is intended.
# NOTE(review): PCA is fit on unscaled data, so the columns with the
# largest raw variance dominate the leading components.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the six leading principal components — one per model input variable.
n_components = 6
pc_scores = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# Wrap the component scores in a dataframe.
# NOTE(review): these columns are principal components, not the original
# variables; reusing the raw feature names here makes the later
# feature-importance table read as if it ranked the raw inputs — consider
# labelling them 'PC1'..'PC6' instead.
principal_df = pd.DataFrame(
    data=pc_scores,
    columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers',
             'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country names; only 'Mortality Rate' is read from the
# encoded frame in this section.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs are the six PCA scores; the target is the mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence',
                 'male_smokers', 'life_expectancy', 'aged_65_older',
                 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Learn standardization statistics from the training split only,
# so no test-set information leaks into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both partitions.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base regressor; hyperparameters are selected by the grid search below.
xgb_model = xgb.XGBRegressor()
# Search space: tree depth, shrinkage, ensemble size, minimum split loss,
# and row/column subsampling fractions.
params = dict(max_depth=[3, 4, 5],
              learning_rate=[0.1, 0.01, 0.001],
              n_estimators=[50, 100, 150],
              gamma=[0, 0.1, 0.2],
              subsample=[0.8, 0.9],
              colsample_bytree=[0.8, 0.9])
# Exhaustive grid search with 10-fold cross-validation, using all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params,
                           cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9986102850111352
# Take the grid-search winner and refit it on the full training split.
# (GridSearchCV's default refit=True already did this; the explicit fit
# below simply repeats it.)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test rows.
y_pred = best_model.predict(X_test_scaled)
# Test-set metrics: MSE, R^2, RMSE, and relative entropy.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
# NOTE(review): scipy.stats.entropy normalizes its two inputs into
# probability distributions, so this value is a KL divergence between the
# normalized actual and predicted series — not a conventional regression
# metric; confirm this interpretation is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0023723489330236868 R2 Score: 0.9988124530308832 RMSE: 0.048707 Entropy Value: 0.0002848751285545325
# Tabulate the tuned model's importance scores, largest first.
# NOTE(review): because the PCA scores were relabelled with raw feature
# names, these rows rank principal components, not the original variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
      .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.861403 |
| 0 | cardiovasc_death_rate | 0.093971 |
| 3 | life_expectancy | 0.021326 |
| 2 | male_smokers | 0.017737 |
| 5 | median_age | 0.005372 |
| 4 | aged_65_older | 0.000190 |
# Reload the cleaned 26-country OWID dataframe for the next analysis run.
# NOTE(review): hard-coded, user-specific absolute path — consider a
# relative path or a configurable constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis.
country1 = 'Portugal'
country2 = 'Slovenia'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.536669 |
2096 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per country so lags never cross a country boundary).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows fabricates "0 mortality" lags for the first
# 1/7/30 days of each country; dropping those rows instead would avoid that bias — TODO confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' itself plus its three lagged copies,
# so the prediction target leaks into the principal components that later become the model
# inputs — this inflates the reported R^2. PCA is also fitted on the FULL dataset (before
# the train/test split) and on unscaled columns of very different magnitudes (population
# dominates the variance). Fit on scaled predictor columns of the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL input
# columns, including the mortality lags), not the original variables; reusing the raw
# feature names here makes the later feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream — X is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a time series mixes future and past observations
# between train and test; a chronological split would be more defensible — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using the statistics learned from the training set only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values for the exhaustive grid search.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive search over the grid with 10-fold cross-validation, using every CPU core.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9987931112255966
# Fit the model using the best hyperparameters
# NOTE(review): with the default refit=True, GridSearchCV has already refitted
# best_estimator_ on the whole training set, so this second fit is redundant (harmless).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between the two
# arrays treated as (re-normalized) probability distributions; applied to raw regression
# targets vs. predictions it is not a meaningful error metric — TODO reconsider/remove.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002719376915070921 R2 Score: 0.9986387382697272 RMSE: 0.052148 Entropy Value: 0.00030312420047472904
# Rank the model inputs by XGBoost importance, largest first.
# NOTE(review): the columns of X are principal components, so these labels actually
# name PCA components rather than the original variables they were copied from.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.727041 |
| 5 | population | 0.146595 |
| 0 | hospital_beds_per_thousand | 0.068287 |
| 2 | extreme_poverty | 0.033873 |
| 3 | gdp_per_capita | 0.023341 |
| 4 | population_density | 0.000863 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute local path — not portable; consider a relative path or config value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (bare expression renders it in the notebook).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis.
country1 = 'Sweden'
country2 = 'United Kingdom'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 122.137 | 4.28 | 24.7 | 81.32 | 18.517 | 40.8 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 122.137 | 4.28 | 24.7 | 81.32 | 18.517 | 40.8 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 122.137 | 4.28 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 122.137 | 4.28 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 122.137 | 4.28 | 24.7 | 81.32 | 18.517 | 40.8 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 133.982 | 4.79 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 133.982 | 4.79 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 133.982 | 4.79 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 133.982 | 4.79 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 133.982 | 4.79 | 18.9 | 82.80 | 19.985 | 41.0 | 0.816005 |
2126 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per country so lags never cross a country boundary).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows fabricates "0 mortality" lags for the first
# 1/7/30 days of each country; dropping those rows instead would avoid that bias — TODO confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' itself plus its three lagged copies,
# so the prediction target leaks into the principal components that later become the model
# inputs — this inflates the reported R^2. PCA is also fitted on the FULL dataset (before
# the train/test split) and on unscaled columns of very different magnitudes. Fit on
# scaled predictor columns of the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL input
# columns, including the mortality lags), not the original variables; reusing the raw
# feature names here makes the later feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream — X is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a time series mixes future and past observations
# between train and test; a chronological split would be more defensible — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using the statistics learned from the training set only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values for the exhaustive grid search.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive search over the grid with 10-fold cross-validation, using every CPU core.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9583436207367914
# Fit the model using the best hyperparameters
# NOTE(review): with the default refit=True, GridSearchCV has already refitted
# best_estimator_ on the whole training set, so this second fit is redundant (harmless).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between the two
# arrays treated as (re-normalized) probability distributions; applied to raw regression
# targets vs. predictions it is not a meaningful error metric — TODO reconsider/remove.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.3215249530704957 R2 Score: 0.9881991027784613 RMSE: 0.567032 Entropy Value: 0.002642072340961003
# Rank the model inputs by XGBoost importance, largest first.
# NOTE(review): the columns of X are principal components, so these labels actually
# name PCA components rather than the original variables they were copied from.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.372382 |
| 5 | median_age | 0.354078 |
| 2 | male_smokers | 0.071201 |
| 1 | diabetes_prevalence | 0.068132 |
| 4 | aged_65_older | 0.067241 |
| 3 | life_expectancy | 0.066966 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute local path — not portable; consider a relative path or config value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (bare expression renders it in the notebook).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis.
country1 = 'Sweden'
country2 = 'United Kingdom'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.816005 |
2126 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per country so lags never cross a country boundary).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows fabricates "0 mortality" lags for the first
# 1/7/30 days of each country; dropping those rows instead would avoid that bias — TODO confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' itself plus its three lagged copies,
# so the prediction target leaks into the principal components that later become the model
# inputs — this inflates the reported R^2. PCA is also fitted on the FULL dataset (before
# the train/test split) and on unscaled columns of very different magnitudes (population
# dominates the variance). Fit on scaled predictor columns of the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL input
# columns, including the mortality lags), not the original variables; reusing the raw
# feature names here makes the later feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream — X is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a time series mixes future and past observations
# between train and test; a chronological split would be more defensible — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using the statistics learned from the training set only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values for the exhaustive grid search.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive search over the grid with 10-fold cross-validation, using every CPU core.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9546833804604962
# Fit the model using the best hyperparameters
# NOTE(review): with the default refit=True, GridSearchCV has already refitted
# best_estimator_ on the whole training set, so this second fit is redundant (harmless).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between the two
# arrays treated as (re-normalized) probability distributions; applied to raw regression
# targets vs. predictions it is not a meaningful error metric — TODO reconsider/remove.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.997926932618917 R2 Score: 0.9633731906218204 RMSE: 0.998963 Entropy Value: 0.005106077515730719
# Rank the model inputs by XGBoost importance, largest first.
# NOTE(review): the columns of X are principal components, so these labels actually
# name PCA components rather than the original variables they were copied from.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.576604 |
| 2 | extreme_poverty | 0.127571 |
| 5 | population | 0.122504 |
| 0 | hospital_beds_per_thousand | 0.113870 |
| 4 | population_density | 0.046708 |
| 3 | gdp_per_capita | 0.012743 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute local path — not portable; consider a relative path or config value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (bare expression renders it in the notebook).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis.
country1 = 'United States'
country2 = 'Austria'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 24.6 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084791 |
2112 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per country so lags never cross a country boundary).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows fabricates "0 mortality" lags for the first
# 1/7/30 days of each country; dropping those rows instead would avoid that bias — TODO confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' itself plus its three lagged copies,
# so the prediction target leaks into the principal components that later become the model
# inputs — this inflates the reported R^2. PCA is also fitted on the FULL dataset (before
# the train/test split) and on unscaled columns of very different magnitudes. Fit on
# scaled predictor columns of the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL input
# columns, including the mortality lags), not the original variables; reusing the raw
# feature names here makes the later feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream — X is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a time series mixes future and past observations
# between train and test; a chronological split would be more defensible — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using statistics learned from the training split only
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
# Base XGBoost regressor whose hyperparameters are tuned below
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values for the exhaustive grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation, parallelized across all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9871989498238769
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits best_estimator_ on the full training data
# by default (refit=True), so this explicit fit() is redundant — harmless, but it
# retrains on the same (X_train_scaled, y_train).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors into probability
# distributions and returns their KL divergence — it is not a standard regression
# error metric, and it returns inf wherever y_pred is 0 while y_test is positive;
# confirm this is the intended measure before reporting it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0060186676398561656 R2 Score: 0.9961495778227457 RMSE: 0.077580 Entropy Value: 0.0005991889258687465
# Rank the model inputs by the importance XGBoost assigned to each one,
# highest first (labels come from selected_cols, in input order)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.689863 |
| 0 | cardiovasc_death_rate | 0.126162 |
| 5 | median_age | 0.108889 |
| 2 | male_smokers | 0.031105 |
| 4 | aged_65_older | 0.028256 |
| 3 | life_expectancy | 0.015725 |
# Reload the full 26-country dataframe from disk for the next country pair
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'United States'
country2 = 'Austria'
# Keep only the socio-economic ("country health index") features plus identifiers and
# the target, and restrict the rows to the two countries being compared
feature_subset = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                  'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                  'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_subset]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2112 rows × 9 columns
'''
I need to create lagged variables for the previous day's, previous week's, and previous month's mortality rates.
These lagged variables can be created with pandas' shift() method, which converts the Our World in Data COVID-19 time-series dataset into a supervised learning problem so that the XGBoost model can be used.
The data must be in a tabular format where each row represents a single observation and each column represents a feature.
Converting the time series into a supervised learning problem therefore lets the XGBoost model directly assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() preserves row order, so this assumes each location's rows are
# already sorted chronologically; 'date' is a string (m/d/yyyy), so string order is not
# date order — confirm the CSV is date-sorted within each location.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each location have no earlier value to shift in)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] covers every column after 'location' and 'date' — per the
# column selection above, that is the six features PLUS 'Mortality Rate' (the target)
# and the three lag columns. Fitting PCA on the target leaks it into the features and
# inflates the downstream scores; consider excluding the target from the PCA input.
# NOTE(review): PCA is also fit on the full dataset before the train/test split, and
# the columns are not standardized first, so high-variance columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold the first six principal components, NOT the original
# features — reusing the raw feature names here is misleading, and the downstream
# feature-importance table inherits these labels (it ranks components, not features).
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs — X below is built
# from principal_df, and y only reads 'Mortality Rate'; this step only widens df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split on time-series rows mixes past and future observations;
# a chronological split would give a more honest test score.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# (fitting the scaler on X_train only avoids test-set leakage at this step)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using statistics learned from the training split only
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
# Base XGBoost regressor whose hyperparameters are tuned below
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values for the exhaustive grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation, parallelized across all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9887577648734872
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits best_estimator_ on the full training data
# by default (refit=True), so this explicit fit() is redundant — harmless, but it
# retrains on the same (X_train_scaled, y_train).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors into probability
# distributions and returns their KL divergence — it is not a standard regression
# error metric, and it returns inf wherever y_pred is 0 while y_test is positive;
# confirm this is the intended measure before reporting it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00406836830426948 R2 Score: 0.9973972751975432 RMSE: 0.063784 Entropy Value: 0.0005141467774508717
# Rank the model inputs by the importance XGBoost assigned to each one,
# highest first (labels come from selected_cols, in input order)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.670685 |
| 5 | population | 0.231413 |
| 4 | population_density | 0.042005 |
| 2 | extreme_poverty | 0.032527 |
| 3 | gdp_per_capita | 0.019548 |
| 0 | hospital_beds_per_thousand | 0.003822 |
# Reload the full 26-country dataframe from disk for the next country pair
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Belgium'
country2 = 'Estonia'
# Keep only the population-health features plus identifiers and the target,
# and restrict the rows to the two countries being compared
feature_subset = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                  'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
                  'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_subset]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 114.898 | 4.29 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 114.898 | 4.29 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 114.898 | 4.29 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 114.898 | 4.29 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 114.898 | 4.29 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 255.569 | 4.02 | 39.3 | 78.74 | 19.452 | 42.7 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 255.569 | 4.02 | 39.3 | 78.74 | 19.452 | 42.7 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 255.569 | 4.02 | 39.3 | 78.74 | 19.452 | 42.7 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 255.569 | 4.02 | 39.3 | 78.74 | 19.452 | 42.7 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 255.569 | 4.02 | 39.3 | 78.74 | 19.452 | 42.7 | 0.466423 |
2121 rows × 9 columns
'''
I need to create lagged variables for the previous day's, previous week's, and previous month's mortality rates.
These lagged variables can be created with pandas' shift() method, which converts the Our World in Data COVID-19 time-series dataset into a supervised learning problem so that the XGBoost model can be used.
The data must be in a tabular format where each row represents a single observation and each column represents a feature.
Converting the time series into a supervised learning problem therefore lets the XGBoost model directly assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() preserves row order, so this assumes each location's rows are
# already sorted chronologically; 'date' is a string (m/d/yyyy), so string order is not
# date order — confirm the CSV is date-sorted within each location.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each location have no earlier value to shift in)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] covers every column after 'location' and 'date' — per the
# column selection above, that is the six features PLUS 'Mortality Rate' (the target)
# and the three lag columns. Fitting PCA on the target leaks it into the features and
# inflates the downstream scores; consider excluding the target from the PCA input.
# NOTE(review): PCA is also fit on the full dataset before the train/test split, and
# the columns are not standardized first, so high-variance columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold the first six principal components, NOT the original
# features — reusing the raw feature names here is misleading, and the downstream
# feature-importance table inherits these labels (it ranks components, not features).
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs — X below is built
# from principal_df, and y only reads 'Mortality Rate'; this step only widens df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split on time-series rows mixes past and future observations;
# a chronological split would give a more honest test score.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# (fitting the scaler on X_train only avoids test-set leakage at this step)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using statistics learned from the training split only
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
# Base XGBoost regressor whose hyperparameters are tuned below
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values for the exhaustive grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation, parallelized across all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9988940139433922
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits best_estimator_ on the full training data
# by default (refit=True), so this explicit fit() is redundant — harmless, but it
# retrains on the same (X_train_scaled, y_train).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors into probability
# distributions and returns their KL divergence — it is not a standard regression
# error metric, and it returns inf wherever y_pred is 0 while y_test is positive;
# confirm this is the intended measure before reporting it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.014213131068063751 R2 Score: 0.9988739751027007 RMSE: 0.119219 Entropy Value: 0.0008755638819072048
# Rank the model inputs by the importance XGBoost assigned to each one,
# highest first (labels come from selected_cols, in input order)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.867373 |
| 0 | cardiovasc_death_rate | 0.099856 |
| 2 | male_smokers | 0.023081 |
| 5 | median_age | 0.008580 |
| 3 | life_expectancy | 0.001066 |
| 4 | aged_65_older | 0.000044 |
# Reload the full 26-country dataframe from disk for the next country pair
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Belgium'
country2 = 'Estonia'
# Keep only the socio-economic ("country health index") features plus identifiers and
# the target, and restrict the rows to the two countries being compared
feature_subset = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                  'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                  'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_subset]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
2121 rows × 9 columns
'''
I need to create lagged variables for the previous day's, previous week's, and previous month's mortality rates.
These lagged variables can be created with pandas' shift() method, which converts the Our World in Data COVID-19 time-series dataset into a supervised learning problem so that the XGBoost model can be used.
The data must be in a tabular format where each row represents a single observation and each column represents a feature.
Converting the time series into a supervised learning problem therefore lets the XGBoost model directly assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() preserves row order, so this assumes each location's rows are
# already sorted chronologically; 'date' is a string (m/d/yyyy), so string order is not
# date order — confirm the CSV is date-sorted within each location.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each location have no earlier value to shift in)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] covers every column after 'location' and 'date' — per the
# column selection above, that is the six features PLUS 'Mortality Rate' (the target)
# and the three lag columns. Fitting PCA on the target leaks it into the features and
# inflates the downstream scores; consider excluding the target from the PCA input.
# NOTE(review): PCA is also fit on the full dataset before the train/test split, and
# the columns are not standardized first, so high-variance columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold the first six principal components, NOT the original
# features — reusing the raw feature names here is misleading, and the downstream
# feature-importance table inherits these labels (it ranks components, not features).
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs — X below is built
# from principal_df, and y only reads 'Mortality Rate'; this step only widens df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split on time-series rows mixes past and future observations;
# a chronological split would give a more honest test score.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# (fitting the scaler on X_train only avoids test-set leakage at this step)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using statistics learned from the training split only
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
# Base XGBoost regressor whose hyperparameters are tuned below
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values for the exhaustive grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation, parallelized across all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9985104636986213
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits best_estimator_ on the full training data
# by default (refit=True), so this explicit fit() is redundant — harmless, but it
# retrains on the same (X_train_scaled, y_train).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors into probability
# distributions and returns their KL divergence — it is not a standard regression
# error metric, and it returns inf wherever y_pred is 0 while y_test is positive;
# confirm this is the intended measure before reporting it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.012715196293674761 R2 Score: 0.9989926478879171 RMSE: 0.112762 Entropy Value: 0.0008232624023628059
# Rank the model inputs by the importance XGBoost assigned to each one,
# highest first (labels come from selected_cols, in input order)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.744249 |
| 0 | hospital_beds_per_thousand | 0.102687 |
| 2 | extreme_poverty | 0.100773 |
| 5 | population | 0.041484 |
| 3 | gdp_per_capita | 0.010488 |
| 4 | population_density | 0.000319 |
# Reload the full 26-country dataframe from disk for the next country pair
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Ireland'
country2 = 'Latvia'
# Keep only the population-health predictors plus the mortality target,
# restricted to the two countries under comparison.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
               'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 126.459 | 3.28 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 126.459 | 3.28 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 126.459 | 3.28 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 126.459 | 3.28 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 126.459 | 3.28 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.060 | 4.91 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.060 | 4.91 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.060 | 4.91 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.060 | 4.91 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.060 | 4.91 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631969 |
2073 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality predictors (previous day/week/month) per country so the
# time series becomes a supervised-learning table usable by XGBoost.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    # Early rows have no history for this lag; treat missing history as 0 mortality.
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
    )

# Principal Component Analysis to address multi-collinearity among the predictors.
# FIX: the original code fit PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate'; the leading components then encode the target
# itself (target leakage), inflating downstream R^2. PCA is now fit on
# predictors only (the health features plus the lagged mortality columns).
predictor_matrix = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(predictor_matrix)

# Keep 6 components to match the number of input variables used by the XGBoost model.
n_components = 6
principal_components = pca.transform(predictor_matrix)[:, :n_components]

# NOTE(review): these column labels reuse the *original* feature names, but each
# column is actually a principal component (a linear mixture of all predictors).
# The labels are kept only so the downstream `selected_cols` lookups keep working;
# importances reported against them do not describe the named raw features.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers',
             'life_expectancy', 'aged_65_older', 'median_age'],
)
principal_df['location'] = df_updated['location'].values

# One-hot encode the country label (kept for parity with the original pipeline;
# the dummies are not part of X below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers',
                 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for testing; fixed seed for reproducibility.
# (Pasted notebook repr output that was a syntax error in this .py file removed.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features: fit the scaler on the training split only, so no
# test-set statistics leak into training, then apply the same transform to both.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Tune an XGBoost regressor with an exhaustive grid search over depth,
# learning rate, tree count, regularization and sampling ratios, using
# 10-fold cross-validation on the scaled training data.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9985163505355373
# Refit the best estimator found by the grid search, then score it on the
# held-out test split.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)

# Regression diagnostics: MSE, RMSE, R^2, plus an "entropy" figure.
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two *normalized distributions*, not a regression error metric; applying it to
# raw target/prediction vectors is statistically questionable (and yields inf
# when a prediction is 0 where the true value is not) — confirm it is intended.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0019485868775483287 R2 Score: 0.9991159573277734 RMSE: 0.044143 Entropy Value: 0.00044838336973350434
# Rank the trained model's feature importances.
# NOTE(review): the 'feature' labels refer to principal-component columns that
# merely reuse the raw feature names, so these importances describe PCA
# components, not the named original variables.
importances = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importances})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.492280 |
| 1 | diabetes_prevalence | 0.451240 |
| 5 | median_age | 0.028929 |
| 2 | male_smokers | 0.025303 |
| 3 | life_expectancy | 0.001392 |
| 4 | aged_65_older | 0.000856 |
# Reload the full 26-country dataframe for the next analysis section.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Ireland'
country2 = 'Latvia'
# Keep only the country-level (health-system / economic) predictors plus the
# mortality target, restricted to the two countries under comparison.
country_cols = ['location', 'date', 'hospital_beds_per_thousand',
                'human_development_index', 'extreme_poverty', 'gdp_per_capita',
                'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631969 |
2073 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality predictors (previous day/week/month) per country so the
# time series becomes a supervised-learning table usable by XGBoost.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    # Early rows have no history for this lag; treat missing history as 0 mortality.
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
    )

# Principal Component Analysis to address multi-collinearity among the predictors.
# FIX: the original code fit PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate'; the leading components then encode the target
# itself (target leakage), inflating downstream R^2. PCA is now fit on
# predictors only (the country features plus the lagged mortality columns).
predictor_matrix = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(predictor_matrix)

# Keep 6 components to match the number of input variables used by the XGBoost model.
n_components = 6
principal_components = pca.transform(predictor_matrix)[:, :n_components]

# NOTE(review): these column labels reuse the *original* feature names, but each
# column is actually a principal component (a linear mixture of all predictors).
# The labels are kept only so the downstream `selected_cols` lookups keep working;
# importances reported against them do not describe the named raw features.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['hospital_beds_per_thousand', 'human_development_index',
             'extreme_poverty', 'gdp_per_capita', 'population_density',
             'population'],
)
principal_df['location'] = df_updated['location'].values

# One-hot encode the country label (kept for parity with the original pipeline;
# the dummies are not part of X below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population_density',
                 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for testing; fixed seed for reproducibility.
# (Pasted notebook repr output that was a syntax error in this .py file removed.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features: fit the scaler on the training split only, so no
# test-set statistics leak into training, then apply the same transform to both.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Tune an XGBoost regressor with an exhaustive grid search over depth,
# learning rate, tree count, regularization and sampling ratios, using
# 10-fold cross-validation on the scaled training data.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9984706478843881
# Refit the best estimator found by the grid search, then score it on the
# held-out test split.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)

# Regression diagnostics: MSE, RMSE, R^2, plus an "entropy" figure.
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two *normalized distributions*, not a regression error metric; applying it to
# raw target/prediction vectors is statistically questionable (and yields inf
# when a prediction is 0 where the true value is not) — confirm it is intended.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0019969715660626214 R2 Score: 0.9990940059691648 RMSE: 0.044687 Entropy Value: 0.0003069312117218654
# Rank the trained model's feature importances.
# NOTE(review): the 'feature' labels refer to principal-component columns that
# merely reuse the raw feature names, so these importances describe PCA
# components, not the named original variables.
importances = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importances})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.457937 |
| 0 | hospital_beds_per_thousand | 0.370522 |
| 5 | population | 0.147929 |
| 2 | extreme_poverty | 0.021240 |
| 3 | gdp_per_capita | 0.001512 |
| 4 | population_density | 0.000862 |
# Reload the full 26-country dataframe for the next analysis section.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Luxembourg'
country2 = 'Netherlands'
# Keep only the population-health predictors plus the mortality target,
# restricted to the two countries under comparison.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
               'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 128.275 | 4.42 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 128.275 | 4.42 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 128.275 | 4.42 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 128.275 | 4.42 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 128.275 | 4.42 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
2078 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality predictors (previous day/week/month) per country so the
# time series becomes a supervised-learning table usable by XGBoost.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    # Early rows have no history for this lag; treat missing history as 0 mortality.
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
    )

# Principal Component Analysis to address multi-collinearity among the predictors.
# FIX: the original code fit PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate'; the leading components then encode the target
# itself (target leakage), inflating downstream R^2. PCA is now fit on
# predictors only (the health features plus the lagged mortality columns).
predictor_matrix = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(predictor_matrix)

# Keep 6 components to match the number of input variables used by the XGBoost model.
n_components = 6
principal_components = pca.transform(predictor_matrix)[:, :n_components]

# NOTE(review): these column labels reuse the *original* feature names, but each
# column is actually a principal component (a linear mixture of all predictors).
# The labels are kept only so the downstream `selected_cols` lookups keep working;
# importances reported against them do not describe the named raw features.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers',
             'life_expectancy', 'aged_65_older', 'median_age'],
)
principal_df['location'] = df_updated['location'].values

# One-hot encode the country label (kept for parity with the original pipeline;
# the dummies are not part of X below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers',
                 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for testing; fixed seed for reproducibility.
# (Pasted notebook repr output that was a syntax error in this .py file removed.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features: fit the scaler on the training split only, so no
# test-set statistics leak into training, then apply the same transform to both.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Tune an XGBoost regressor with an exhaustive grid search over depth,
# learning rate, tree count, regularization and sampling ratios, using
# 10-fold cross-validation on the scaled training data.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990232848825181
# Refit the best estimator found by the grid search, then score it on the
# held-out test split.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)

# Regression diagnostics: MSE, RMSE, R^2, plus an "entropy" figure.
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two *normalized distributions*, not a regression error metric; applying it to
# raw target/prediction vectors is statistically questionable (and yields inf
# when a prediction is 0 where the true value is not) — confirm it is intended.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005227635293595947 R2 Score: 0.9993186676908599 RMSE: 0.072302 Entropy Value: 0.0006067133512587061
# Rank the trained model's feature importances.
# NOTE(review): the 'feature' labels refer to principal-component columns that
# merely reuse the raw feature names, so these importances describe PCA
# components, not the named original variables.
importances = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importances})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.643346 |
| 0 | cardiovasc_death_rate | 0.233654 |
| 5 | median_age | 0.067902 |
| 2 | male_smokers | 0.048321 |
| 3 | life_expectancy | 0.006387 |
| 4 | aged_65_older | 0.000389 |
# Reload the full 26-country dataframe for the next analysis section.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Luxembourg'
country2 = 'Netherlands'
# Keep only the country-level (health-system / economic) predictors plus the
# mortality target, restricted to the two countries under comparison.
country_cols = ['location', 'date', 'hospital_beds_per_thousand',
                'human_development_index', 'extreme_poverty', 'gdp_per_capita',
                'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
2078 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality predictors (previous day/week/month) per country so the
# time series becomes a supervised-learning table usable by XGBoost.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    # Early rows have no history for this lag; treat missing history as 0 mortality.
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
    )

# Principal Component Analysis to address multi-collinearity among the predictors.
# FIX: the original code fit PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate'; the leading components then encode the target
# itself (target leakage), inflating downstream R^2. PCA is now fit on
# predictors only (the country features plus the lagged mortality columns).
predictor_matrix = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(predictor_matrix)

# Keep 6 components to match the number of input variables used by the XGBoost model.
n_components = 6
principal_components = pca.transform(predictor_matrix)[:, :n_components]

# NOTE(review): these column labels reuse the *original* feature names, but each
# column is actually a principal component (a linear mixture of all predictors).
# The labels are kept only so the downstream `selected_cols` lookups keep working;
# importances reported against them do not describe the named raw features.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['hospital_beds_per_thousand', 'human_development_index',
             'extreme_poverty', 'gdp_per_capita', 'population_density',
             'population'],
)
principal_df['location'] = df_updated['location'].values

# One-hot encode the country label (kept for parity with the original pipeline;
# the dummies are not part of X below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population_density',
                 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987243093178199
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.007716113263742701 R2 Score: 0.9989943374064345 RMSE: 0.087841 Entropy Value: 0.0017161933932385992
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.482469 |
| 2 | extreme_poverty | 0.244199 |
| 0 | hospital_beds_per_thousand | 0.199908 |
| 5 | population | 0.057787 |
| 3 | gdp_per_capita | 0.014620 |
| 4 | population_density | 0.001017 |
# Reload the full dataframe of all 26 countries, overwriting the filtered /
# one-hot-encoded df_updated from the previous analysis.
# NOTE(review): hard-coded local Windows path — not portable; consider a
# relative path or a configuration variable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair for this run of the population-health-index model.
country1 = 'Romania'
country2 = 'Slovakia'
# Keep only the population-health predictors plus identifiers ('location',
# 'date') and the target ('Mortality Rate') for the XGBoost Model Analysis.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict the rows to the two countries being compared.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 370.946 | 9.74 | 37.1 | 76.05 | 17.85 | 43.0 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 370.946 | 9.74 | 37.1 | 76.05 | 17.85 | 43.0 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 370.946 | 9.74 | 37.1 | 76.05 | 17.85 | 43.0 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 370.946 | 9.74 | 37.1 | 76.05 | 17.85 | 43.0 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 370.946 | 9.74 | 37.1 | 76.05 | 17.85 | 43.0 | 2.036403 |
2067 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# --- Supervised-learning conversion: lagged mortality features ---
# Create lagged variables for the previous day, previous week, and previous
# month mortality rates; shift() is applied per 'location' group so one
# country's lags never borrow values from the other country's rows.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first 1/7/30 rows of each country have no lag available; impute the
# resulting NaNs with 0 (treated as "no prior mortality").
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) to address multi-collinearity.
# NOTE(review): iloc[:, 2:] includes the target 'Mortality Rate' and the lag
# features built above, so the target leaks into the PCA inputs.
# NOTE(review): PCA is fit on ALL rows before the train/test split below
# (train/test leakage), and on unscaled columns (scaling happens after PCA).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per input variable of the
# XGBoost Model Analysis for the population health index.
n_components = 6 # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Wrap the component scores in a DataFrame.
# NOTE(review): these columns are principal components PC1..PC6, NOT the raw
# variables; reusing raw feature names makes the feature-importance table
# below read as a ranking of the raw variables, which it is not.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the 'location' column with get_dummies().
# NOTE(review): the dummy columns are never used — X below comes from
# principal_df only; this call mainly removes the string column before
# 'Mortality Rate' is pulled out as y.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into a 70% training set and a 30% testing set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only (no test leakage at this step).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply the training-set scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define the hyperparameter grid to tune (324 combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Grid search with 10-fold cross-validation (k = 10), using all cores (n_jobs=-1)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and best mean CV score (the estimator's default score)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989475810367674
# Fit the model using the best hyperparameters.
# NOTE(review): with refit=True (the GridSearchCV default), best_estimator_ is
# already fitted; this second fit is redundant though harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: MSE, RMSE, R^2 Score, and "Entropy".
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is a KL divergence over
# normalised distributions, not a regression metric; it returns inf whenever
# y_pred has a zero where y_test does not.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0009858239346901049 R2 Score: 0.9994441257129435 RMSE: 0.031398 Entropy Value: 0.00012076580400027207
# Rank the model inputs by XGBoost's feature importances.
# NOTE(review): because X holds principal components, these importances rank
# PCs, not the raw variables whose names label the rows.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.772914 |
| 1 | diabetes_prevalence | 0.149105 |
| 0 | cardiovasc_death_rate | 0.068522 |
| 2 | male_smokers | 0.008368 |
| 3 | life_expectancy | 0.000947 |
| 4 | aged_65_older | 0.000144 |
# Reload the full dataframe of all 26 countries, overwriting the filtered /
# one-hot-encoded df_updated from the previous analysis.
# NOTE(review): hard-coded local Windows path — not portable; consider a
# relative path or a configuration variable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair for this run of the country-health-index model.
country1 = 'Romania'
country2 = 'Slovakia'
# Keep only the country-health predictors plus identifiers ('location',
# 'date') and the target ('Mortality Rate') for the XGBoost Model Analysis.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict the rows to the two countries being compared.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.820 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.820 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.820 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.820 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.820 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
2067 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# --- Supervised-learning conversion: lagged mortality features ---
# Create lagged variables for the previous day, previous week, and previous
# month mortality rates; shift() is applied per 'location' group so one
# country's lags never borrow values from the other country's rows.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first 1/7/30 rows of each country have no lag available; impute the
# resulting NaNs with 0 (treated as "no prior mortality").
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) to address multi-collinearity.
# NOTE(review): iloc[:, 2:] includes the target 'Mortality Rate' and the lag
# features built above, so the target leaks into the PCA inputs.
# NOTE(review): PCA is fit on ALL rows before the train/test split below
# (train/test leakage), and on unscaled columns (scaling happens after PCA).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per input variable of the
# XGBoost Model Analysis for the country health index.
n_components = 6 # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Wrap the component scores in a DataFrame.
# NOTE(review): these columns are principal components PC1..PC6, NOT the raw
# variables; reusing raw feature names makes the feature-importance table
# below read as a ranking of the raw variables, which it is not.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the 'location' column with get_dummies().
# NOTE(review): the dummy columns are never used — X below comes from
# principal_df only; this call mainly removes the string column before
# 'Mortality Rate' is pulled out as y.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into a 70% training set and a 30% testing set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only (no test leakage at this step).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply the training-set scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define the hyperparameter grid to tune (324 combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Grid search with 10-fold cross-validation (k = 10), using all cores (n_jobs=-1)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and best mean CV score (the estimator's default score)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.998030341444381
# Fit the model using the best hyperparameters.
# NOTE(review): with refit=True (the GridSearchCV default), best_estimator_ is
# already fitted; this second fit is redundant though harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: MSE, RMSE, R^2 Score, and "Entropy".
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is a KL divergence over
# normalised distributions, not a regression metric; it returns inf whenever
# y_pred has a zero where y_test does not.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0026098629278783294 R2 Score: 0.99852838256072 RMSE: 0.051087 Entropy Value: 0.0004080770571716045
# Rank the model inputs by XGBoost's feature importances.
# NOTE(review): because X holds principal components, these importances rank
# PCs, not the raw variables whose names label the rows.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.746990 |
| 1 | human_development_index | 0.214658 |
| 0 | hospital_beds_per_thousand | 0.020482 |
| 2 | extreme_poverty | 0.015312 |
| 3 | gdp_per_capita | 0.002071 |
| 4 | population_density | 0.000487 |
# Reload the full dataframe of all 26 countries, overwriting the filtered /
# one-hot-encoded df_updated from the previous analysis.
# NOTE(review): hard-coded local Windows path — not portable; consider a
# relative path or a configuration variable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair for this run of the population-health-index model.
country1 = 'Spain'
country2 = 'Switzerland'
# Keep only the population-health predictors plus identifiers ('location',
# 'date') and the target ('Mortality Rate') for the XGBoost Model Analysis.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict the rows to the two countries being compared.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 99.739 | 5.59 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 99.739 | 5.59 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 99.739 | 5.59 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 99.739 | 5.59 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 99.739 | 5.59 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 99.403 | 7.17 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 99.403 | 7.17 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 99.403 | 7.17 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 99.403 | 7.17 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 99.403 | 7.17 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# --- Supervised-learning conversion: lagged mortality features ---
# Create lagged variables for the previous day, previous week, and previous
# month mortality rates; shift() is applied per 'location' group so one
# country's lags never borrow values from the other country's rows.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first 1/7/30 rows of each country have no lag available; impute the
# resulting NaNs with 0 (treated as "no prior mortality").
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) to address multi-collinearity.
# NOTE(review): iloc[:, 2:] includes the target 'Mortality Rate' and the lag
# features built above, so the target leaks into the PCA inputs.
# NOTE(review): PCA is fit on ALL rows before the train/test split below
# (train/test leakage), and on unscaled columns (scaling happens after PCA).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per input variable of the
# XGBoost Model Analysis for the population health index.
n_components = 6 # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Wrap the component scores in a DataFrame.
# NOTE(review): these columns are principal components PC1..PC6, NOT the raw
# variables; reusing raw feature names makes the feature-importance table
# below read as a ranking of the raw variables, which it is not.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the 'location' column with get_dummies().
# NOTE(review): the dummy columns are never used — X below comes from
# principal_df only; this call mainly removes the string column before
# 'Mortality Rate' is pulled out as y.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into a 70% training set and a 30% testing set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only (no test leakage at this step).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply the training-set scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define the hyperparameter grid to tune (324 combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Grid search with 10-fold cross-validation (k = 10), using all cores (n_jobs=-1)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and best mean CV score (the estimator's default score)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9991186723177916
# Fit the model using the best hyperparameters.
# NOTE(review): with refit=True (the GridSearchCV default), best_estimator_ is
# already fitted; this second fit is redundant though harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: MSE, RMSE, R^2 Score, and "Entropy".
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is a KL divergence over
# normalised distributions, not a regression metric; it returns inf whenever
# y_pred has a zero where y_test does not — the recorded output below
# ("Entropy Value: inf") shows exactly this failure mode.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005248655333758953 R2 Score: 0.9991504781499669 RMSE: 0.072448 Entropy Value: inf
# Rank the model inputs by XGBoost's feature importances.
# NOTE(review): because X holds principal components, these importances rank
# PCs, not the raw variables whose names label the rows.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.534733 |
| 5 | median_age | 0.408620 |
| 2 | male_smokers | 0.039876 |
| 1 | diabetes_prevalence | 0.012956 |
| 3 | life_expectancy | 0.003662 |
| 4 | aged_65_older | 0.000153 |
# Reload the full dataframe of all 26 countries, overwriting the filtered /
# one-hot-encoded df_updated from the previous analysis.
# NOTE(review): hard-coded local Windows path — not portable; consider a
# relative path or a configuration variable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Spain'
country2 = 'Switzerland'
# Restrict to the country-health-index features used by the XGBoost analysis,
# then keep only the two countries being compared.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[keep_cols]
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated[in_pair]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 2.97 | 0.904 | 1.00 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 2.97 | 0.904 | 1.00 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 2.97 | 0.904 | 1.00 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 2.97 | 0.904 | 1.00 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 2.97 | 0.904 | 1.00 | 34272.360 | 93.105 | 47558632 | 0.855148 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add per-country lagged mortality features (1 day, 7 days, 30 days) so the
# time series can be treated as a supervised-learning problem.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
    # The first `lag` rows of each country have no history; treat them as 0.
    df_updated[lag_col] = shifted.fillna(0)
# PCA to mitigate multi-collinearity among the numeric columns.
# NOTE(review): iloc[:, 2:] spans every column after 'date', which includes
# 'Mortality Rate' and its lag features — the target leaks into the
# components (and PCA is fit before the train/test split), which likely
# inflates the downstream model scores; verify this is intended.
numeric_block = df_updated.iloc[:, 2:]
pca = PCA()
pca.fit(numeric_block)
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep six principal components — one per input variable of this
# country-health-index analysis.
n_components = 6
# NOTE(review): the component columns below reuse the original feature names,
# but each PC is a mixture of all PCA inputs (including the mortality lags),
# so these labels — and any importances derived from them — are nominal.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
principal_df = pd.DataFrame(principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].to_numpy()
# One-hot encode 'location' (the dummy columns are not used in X below)
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn scaling statistics from the training portion only
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale train and test features with statistics learned on the training set
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; hyperparameters are tuned by the grid search below
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid to search over
params = dict(max_depth=[3, 4, 5],
              learning_rate=[0.1, 0.01, 0.001],
              n_estimators=[50, 100, 150],
              gamma=[0, 0.1, 0.2],
              subsample=[0.8, 0.9],
              colsample_bytree=[0.8, 0.9])
# Exhaustive grid search with 10-fold cross-validation on the training data
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1).fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987177894783118
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence D(pk||qk)
# of the arrays normalised to sum to 1 — not an error entropy. It returns
# inf when y_pred has a zero where y_test is non-zero (observed in an
# earlier run) and is undefined for negative predictions; consider a
# standard regression metric instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008170528399905543 R2 Score: 0.9986775579723456 RMSE: 0.090391 Entropy Value: 0.0007620940166931194
# Rank the model inputs by the tuned XGBoost importance scores.
# NOTE(review): 'feature' here names PCA components, not raw features — the
# labels are nominal (see the PCA step), so interpret this ranking with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.522591 |
| 5 | population | 0.301978 |
| 0 | hospital_beds_per_thousand | 0.108450 |
| 2 | extreme_poverty | 0.033910 |
| 3 | gdp_per_capita | 0.032646 |
| 4 | population_density | 0.000425 |
# Load the combined dataframe covering all 26 countries
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Bulgaria'
country2 = 'Czechia'
# Restrict to the population-health-index features used by the XGBoost
# analysis, then keep only the two countries being compared.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated[keep_cols]
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated[in_pair]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 227.485 | 6.82 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 227.485 | 6.82 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 227.485 | 6.82 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 227.485 | 6.82 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 227.485 | 6.82 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919575 |
2061 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add per-country lagged mortality features (1 day, 7 days, 30 days) so the
# time series can be treated as a supervised-learning problem.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
    # The first `lag` rows of each country have no history; treat them as 0.
    df_updated[lag_col] = shifted.fillna(0)
# PCA to mitigate multi-collinearity among the numeric columns.
# NOTE(review): iloc[:, 2:] spans every column after 'date', which includes
# 'Mortality Rate' and its lag features — the target leaks into the
# components (and PCA is fit before the train/test split), which likely
# inflates the downstream model scores; verify this is intended.
numeric_block = df_updated.iloc[:, 2:]
pca = PCA()
pca.fit(numeric_block)
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep six principal components — one per input variable of this
# population-health-index analysis.
n_components = 6
# NOTE(review): the component columns below reuse the original feature names,
# but each PC is a mixture of all PCA inputs (including the mortality lags),
# so these labels — and any importances derived from them — are nominal.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
principal_df = pd.DataFrame(principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].to_numpy()
# One-hot encode 'location' (the dummy columns are not used in X below)
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn scaling statistics from the training portion only
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale train and test features with statistics learned on the training set
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; hyperparameters are tuned by the grid search below
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid to search over
params = dict(max_depth=[3, 4, 5],
              learning_rate=[0.1, 0.01, 0.001],
              n_estimators=[50, 100, 150],
              gamma=[0, 0.1, 0.2],
              subsample=[0.8, 0.9],
              colsample_bytree=[0.8, 0.9])
# Exhaustive grid search with 10-fold cross-validation on the training data
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1).fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9561647329294217
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence D(pk||qk)
# of the arrays normalised to sum to 1 — not an error entropy. It returns
# inf when y_pred has a zero where y_test is non-zero (observed in an
# earlier run) and is undefined for negative predictions; consider a
# standard regression metric instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002649612864395656 R2 Score: 0.9983839393744748 RMSE: 0.051474 Entropy Value: 0.00027256482883888917
# Rank the model inputs by the tuned XGBoost importance scores.
# NOTE(review): 'feature' here names PCA components, not raw features — the
# labels are nominal (see the PCA step), so interpret this ranking with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.464105 |
| 0 | cardiovasc_death_rate | 0.433036 |
| 2 | male_smokers | 0.045127 |
| 4 | aged_65_older | 0.026142 |
| 1 | diabetes_prevalence | 0.021133 |
| 3 | life_expectancy | 0.010457 |
# Load the combined dataframe covering all 26 countries
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Bulgaria'
country2 = 'Czechia'
# Restrict to the country-health-index features used by the XGBoost analysis,
# then keep only the two countries being compared.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[keep_cols]
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated[in_pair]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919575 |
2061 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add per-country lagged mortality features (1 day, 7 days, 30 days) so the
# time series can be treated as a supervised-learning problem.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
    # The first `lag` rows of each country have no history; treat them as 0.
    df_updated[lag_col] = shifted.fillna(0)
# PCA to mitigate multi-collinearity among the numeric columns.
# NOTE(review): iloc[:, 2:] spans every column after 'date', which includes
# 'Mortality Rate' and its lag features — the target leaks into the
# components (and PCA is fit before the train/test split), which likely
# inflates the downstream model scores; verify this is intended.
numeric_block = df_updated.iloc[:, 2:]
pca = PCA()
pca.fit(numeric_block)
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep six principal components — one per input variable of this
# country-health-index analysis.
n_components = 6
# NOTE(review): the component columns below reuse the original feature names,
# but each PC is a mixture of all PCA inputs (including the mortality lags),
# so these labels — and any importances derived from them — are nominal.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
principal_df = pd.DataFrame(principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].to_numpy()
# One-hot encode 'location' (the dummy columns are not used in X below)
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn scaling statistics from the training portion only
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale train and test features with statistics learned on the training set
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; hyperparameters are tuned by the grid search below
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid to search over
params = dict(max_depth=[3, 4, 5],
              learning_rate=[0.1, 0.01, 0.001],
              n_estimators=[50, 100, 150],
              gamma=[0, 0.1, 0.2],
              subsample=[0.8, 0.9],
              colsample_bytree=[0.8, 0.9])
# Exhaustive grid search with 10-fold cross-validation on the training data
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1).fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9555145568773449
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence D(pk||qk)
# of the arrays normalised to sum to 1 — not an error entropy. It returns
# inf when y_pred has a zero where y_test is non-zero (observed in an
# earlier run) and is undefined for negative predictions; consider a
# standard regression metric instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002769256603078144 R2 Score: 0.9983109658703929 RMSE: 0.052624 Entropy Value: 0.00028051806238002606
# Rank the model inputs by the tuned XGBoost importance scores.
# NOTE(review): 'feature' here names PCA components, not raw features — the
# labels are nominal (see the PCA step), so interpret this ranking with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | hospital_beds_per_thousand | 0.823913 |
| 5 | population | 0.128464 |
| 2 | extreme_poverty | 0.019683 |
| 1 | human_development_index | 0.016833 |
| 4 | population_density | 0.005945 |
| 3 | gdp_per_capita | 0.005162 |
# Load the combined dataframe covering all 26 countries
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'France'
country2 = 'Serbia'
# Restrict to the population-health-index features used by the XGBoost
# analysis, then keep only the two countries being compared.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated[keep_cols]
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated[in_pair]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 86.060 | 4.77 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8377 | France | 1/25/2020 | 86.060 | 4.77 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8378 | France | 1/26/2020 | 86.060 | 4.77 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8379 | France | 1/27/2020 | 86.060 | 4.77 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8380 | France | 1/28/2020 | 86.060 | 4.77 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 439.415 | 10.08 | 40.2 | 76.00 | 17.366 | 41.2 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 439.415 | 10.08 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 439.415 | 10.08 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 439.415 | 10.08 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 439.415 | 10.08 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716205 |
2109 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add per-country lagged mortality features (1 day, 7 days, 30 days) so the
# time series can be treated as a supervised-learning problem.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
    # The first `lag` rows of each country have no history; treat them as 0.
    df_updated[lag_col] = shifted.fillna(0)
# PCA to mitigate multi-collinearity among the numeric columns.
# NOTE(review): iloc[:, 2:] spans every column after 'date', which includes
# 'Mortality Rate' and its lag features — the target leaks into the
# components (and PCA is fit before the train/test split), which likely
# inflates the downstream model scores; verify this is intended.
numeric_block = df_updated.iloc[:, 2:]
pca = PCA()
pca.fit(numeric_block)
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the fitted scaler to both splits (training-set statistics only).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the XGBoost regressor to be tuned.
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values for the exhaustive search.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelised
# across all available cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9941440903988384
# Fit the model using the best hyperparameters
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ is
# already fitted on the full training data, so this extra fit() call is
# redundant (though harmless) -- confirm before removing.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions on the held-out 30% test split
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the Kullback-Leibler
# divergence between the two sequences after normalising each to sum to 1;
# it is not the entropy of the prediction error and assumes non-negative
# inputs. Verify this is the intended metric before interpreting the value.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.05493983728553627 R2 Score: 0.9952696565378223 RMSE: 0.234392 Entropy Value: 0.0017421697067628312
# Rank the six principal-component features by the importance the tuned
# XGBoost model assigned to them, highest first.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.758670 |
| 0 | cardiovasc_death_rate | 0.166348 |
| 4 | aged_65_older | 0.025166 |
| 5 | median_age | 0.025088 |
| 2 | male_smokers | 0.013602 |
| 3 | life_expectancy | 0.011126 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- not portable; consider a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for this run.
country1 = 'France'
country2 = 'Serbia'
# Keep only the socio-economic ("country health index") columns plus the
# target, and restrict the rows to the selected pair of countries.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand',
             'human_development_index', 'extreme_poverty', 'gdp_per_capita',
             'population_density', 'population', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 5.980 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| 8377 | France | 1/25/2020 | 5.980 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| 8378 | France | 1/26/2020 | 5.980 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| 8379 | France | 1/27/2020 | 5.980 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| 8380 | France | 1/28/2020 | 5.980 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716205 |
2109 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality features per country (previous day / week / month).
# Grouping by 'location' keeps each country's series separate so the shift
# never carries values across a country boundary; leading rows with no
# history are filled with 0 rather than dropped.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) to address multi-collinearity.
# BUG FIX: the original code ran PCA on df_updated.iloc[:, 2:], which still
# contains the 'Mortality Rate' column -- the prediction target. Feeding the
# target into the feature transform is data leakage and inflates the model's
# apparent accuracy, so the target is excluded before fitting/transforming.
# NOTE(review): the lagged mortality columns are themselves highly correlated
# with the target; confirm against the study design whether they belong in
# the PCA input at all.
pca_input = df_updated.drop(columns=['Mortality Rate']).iloc[:, 2:]
pca = PCA()
pca.fit(pca_input)
# Keep the first 6 principal components -- one per input variable of the
# downstream XGBoost analysis for the country health index.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): principal components are linear mixtures of ALL input
# columns; labelling them with the original feature names below is only a
# naming convention kept for downstream compatibility -- the "feature
# importances" computed later describe components, not the raw variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert the categorical country label to indicator columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only, so test-set statistics never
# influence the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the fitted scaler to both splits (training-set statistics only).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the XGBoost regressor to be tuned.
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values for the exhaustive search.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelised
# across all available cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9930827422814454
# Fit the model using the best hyperparameters
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ is
# already fitted on the full training data, so this extra fit() call is
# redundant (though harmless) -- confirm before removing.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions on the held-out 30% test split
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the Kullback-Leibler
# divergence between the two sequences after normalising each to sum to 1;
# it is not the entropy of the prediction error and assumes non-negative
# inputs. Verify this is the intended metric before interpreting the value.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.07948072065736031 R2 Score: 0.9931566759949306 RMSE: 0.281923 Entropy Value: 0.0020226363731710244
# Rank the six principal-component features by the importance the tuned
# XGBoost model assigned to them, highest first.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.622370 |
| 2 | extreme_poverty | 0.146551 |
| 0 | hospital_beds_per_thousand | 0.105804 |
| 5 | population | 0.055100 |
| 4 | population_density | 0.048866 |
| 3 | gdp_per_capita | 0.021310 |
# Country Pair by Pair Analysis relative to life expectancy
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): hard-coded absolute Windows path -- not portable; consider a
# relative path or a configurable data directory.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
# Display the loaded frame (notebook cell output).
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Showing the pairings of countries based on life expectancy (13 pairs of countries)
# Each per-country frame below is a row subset of the cleaned OWID dataset,
# kept as its own module-level name so later cells can reference it directly.
df_Austria = df[(df.location == "Austria")]
df_Belgium = df[(df.location == "Belgium")]
df_Canada = df[(df.location == "Canada")]
df_Cyprus = df[(df.location == "Cyprus")]
df_Denmark = df[(df.location == "Denmark")]
df_Finland = df[(df.location == "Finland")]
df_France = df[(df.location == "France")]
df_Iceland = df[(df.location == "Iceland")]
df_Ireland = df[(df.location == "Ireland")]
df_Italy = df[(df.location == "Italy")]
df_Luxembourg = df[(df.location == "Luxembourg")]
df_Netherlands = df[(df.location == "Netherlands")]
df_Portugal = df[(df.location == "Portugal")]
df_Slovenia = df[(df.location == "Slovenia")]
df_Spain = df[(df.location == "Spain")]
df_Sweden = df[(df.location == "Sweden")]
df_Switzerland = df[(df.location == "Switzerland")]
df_UnitedKingdom = df[(df.location == "United Kingdom")]
df_Czechia = df[(df.location == "Czechia")]
df_Estonia = df[(df.location == "Estonia")]
df_UnitedStates = df[(df.location == "United States")]
df_Bulgaria = df[(df.location == "Bulgaria")]
df_Latvia = df[(df.location == "Latvia")]
df_Romania = df[(df.location == "Romania")]
df_Serbia = df[(df.location == "Serbia")]
df_Slovakia = df[(df.location == "Slovakia")]
# tail(-2) keeps all rows except the first two for the United Kingdom.
# NOTE(review): presumably this aligns the UK series' start date with the
# other countries -- confirm against the raw data.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
# Concatenate the per-country frames into a single dataframe.
# NOTE(review): despite the original comment about "the first country from
# each pair", this list contains all 26 countries.
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)
# Exporting final_dataframe to CSV file
# NOTE(review): to_csv writes the DataFrame index as an extra leading
# column; pass index=False if that column is unwanted on re-import.
dataframe_one.to_csv("dataframe-one.csv")
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- not portable; the file
# was just written to the working directory, so the Downloads copy may be
# stale relative to the export above.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for this run.
country1 = 'Austria'
country2 = 'Belgium'
# Keep only the demographic/health ("population health index") columns plus
# the target, and restrict the rows to the selected pair of countries.
keep_cols = ['location', 'date', 'cardiovasc_death_rate',
             'diabetes_prevalence', 'female_smokers', 'male_smokers',
             'aged_65_older', 'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 18.571 | 41.8 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 18.571 | 41.8 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 18.571 | 41.8 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 18.571 | 41.8 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 18.571 | 41.8 | 0.711787 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality features per country (previous day / week / month).
# Grouping by 'location' keeps each country's series separate so the shift
# never carries values across a country boundary; leading rows with no
# history are filled with 0 rather than dropped.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) to address multi-collinearity.
# BUG FIX: the original code ran PCA on df_updated.iloc[:, 2:], which still
# contains the 'Mortality Rate' column -- the prediction target. Feeding the
# target into the feature transform is data leakage and inflates the model's
# apparent accuracy, so the target is excluded before fitting/transforming.
# NOTE(review): the lagged mortality columns are themselves highly correlated
# with the target; confirm against the study design whether they belong in
# the PCA input at all.
pca_input = df_updated.drop(columns=['Mortality Rate']).iloc[:, 2:]
pca = PCA()
pca.fit(pca_input)
# Keep the first 6 principal components -- one per input variable of the
# downstream XGBoost analysis for the population health index.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): principal components are linear mixtures of ALL input
# columns; labelling them with the original feature names below is only a
# naming convention kept for downstream compatibility -- the "feature
# importances" computed later describe components, not the raw variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert the categorical country label to indicator columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only, so test-set statistics never
# influence the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the fitted scaler to both splits (training-set statistics only).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the XGBoost regressor to be tuned.
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values for the exhaustive search.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelised
# across all available cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9986706695832035
# Fit the model using the best hyperparameters
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ is
# already fitted on the full training data, so this extra fit() call is
# redundant (though harmless) -- confirm before removing.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions on the held-out 30% test split
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the Kullback-Leibler
# divergence between the two sequences after normalising each to sum to 1;
# it is not the entropy of the prediction error and assumes non-negative
# inputs. Verify this is the intended metric before interpreting the value.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.01609753378306672 R2 Score: 0.998640384719372 RMSE: 0.126876 Entropy Value: 0.0006364314394538599
# Rank the six principal-component features by the importance the tuned
# XGBoost model assigned to them, highest first.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.473470 |
| 1 | diabetes_prevalence | 0.471525 |
| 2 | female_smokers | 0.024737 |
| 5 | median_age | 0.021757 |
| 3 | male_smokers | 0.008255 |
| 4 | aged_65_older | 0.000256 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- not portable; consider a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for this run.
country1 = 'Austria'
country2 = 'Belgium'
# Keep only the socio-economic ("country health index") columns plus the
# target, and restrict the rows to the selected pair of countries.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand',
             'human_development_index', 'extreme_poverty', 'gdp_per_capita',
             'population_density', 'population', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per raw input variable of the
# country-health-index analysis (hospital beds, HDI, extreme poverty,
# GDP per capita, population density, population).
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# BUG FIX: the components were previously labelled with the raw feature names,
# which falsely implied each PC *is* that variable; every PC is a linear mix
# of all inputs, so label them PC1..PC6 and let the downstream importance
# table report honestly.
selected_cols = ['PC{}'.format(i) for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs are the principal components; target is the mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit standardization statistics on the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using the statistics fitted on the training set.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters are chosen by the search below.
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values to explore.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, on all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model,
                           param_grid=params,
                           cv=10,
                           n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9985453142557521
# GridSearchCV(refit=True, the default) has already retrained the winning
# configuration on the full training set, so best_estimator_ is ready to use.
# BUG FIX: removed the redundant best_model.fit(...) that re-trained the same
# model on the same data a second time.
best_model = grid_search.best_estimator_
# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate: MSE, RMSE, R^2 score, and an entropy diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence of the two
# vectors renormalized as probability distributions — not a standard
# regression metric; confirm this diagnostic is really what is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.01731215895223598 R2 Score: 0.9985377961513036 RMSE: 0.131576 Entropy Value: 0.000755209259742723
# Rank the model inputs by their XGBoost importance scores, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.604442 |
| 5 | population | 0.227045 |
| 0 | hospital_beds_per_thousand | 0.118292 |
| 2 | extreme_poverty | 0.041327 |
| 3 | gdp_per_capita | 0.008604 |
| 4 | population_density | 0.000290 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute user path — this only runs on the author's
# machine; parameterize the data directory before sharing the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under analysis.
country1 = 'Canada'
country2 = 'Cyprus'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
# BUG FIX: .copy() detaches the filtered rows from the parent frame, so the
# lagged-column assignments performed later mutate this frame directly instead
# of a view (avoids pandas' SettingWithCopyWarning / silently lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 16.984 | 41.4 | 1.093162 |
2099 rows × 9 columns
# Lagging each country's mortality series turns the OWID timeseries into a
# supervised-learning table: every row carries the previous day's, week's and
# month's mortality rate as features, so XGBoost can be applied directly to
# rank predictors of COVID-19 mortality per country.
by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # shift() leaves NaN where a country has no history yet; treat that
    # pre-history as 0 mortality, as the original analysis did.
    df_updated[lag_col] = by_country.shift(lag).fillna(0)
# Principal Component Analysis to address multi-collinearity among predictors.
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' itself
# (and its lags), so the prediction target leaks into the fitted components —
# confirm this is intended before trusting the near-perfect R^2 downstream.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per raw input variable of the
# population-health-index analysis (cardiovascular death rate, diabetes
# prevalence, female/male smokers, aged 65+, median age).
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# BUG FIX: the components were previously labelled with the raw feature names,
# which falsely implied each PC *is* that variable; every PC is a linear mix
# of all inputs, so label them PC1..PC6 and let the downstream importance
# table report honestly.
selected_cols = ['PC{}'.format(i) for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs are the principal components; target is the mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit standardization statistics on the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using the statistics fitted on the training set.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters are chosen by the search below.
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values to explore.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, on all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model,
                           param_grid=params,
                           cv=10,
                           n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.999053878094869
# GridSearchCV(refit=True, the default) has already retrained the winning
# configuration on the full training set, so best_estimator_ is ready to use.
# BUG FIX: removed the redundant best_model.fit(...) that re-trained the same
# model on the same data a second time.
best_model = grid_search.best_estimator_
# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate: MSE, RMSE, R^2 score, and an entropy diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence of the two
# vectors renormalized as probability distributions — not a standard
# regression metric; confirm this diagnostic is really what is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002132122245038013 R2 Score: 0.9993731672242354 RMSE: 0.046175 Entropy Value: 0.00042462368096495684
# Rank the model inputs by their XGBoost importance scores, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.472233 |
| 1 | diabetes_prevalence | 0.433225 |
| 5 | median_age | 0.076136 |
| 2 | female_smokers | 0.016640 |
| 3 | male_smokers | 0.001581 |
| 4 | aged_65_older | 0.000185 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute user path — this only runs on the author's
# machine; parameterize the data directory before sharing the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under analysis.
country1 = 'Canada'
country2 = 'Cyprus'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# BUG FIX: .copy() detaches the filtered rows from the parent frame, so the
# lagged-column assignments performed later mutate this frame directly instead
# of a view (avoids pandas' SettingWithCopyWarning / silently lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.093162 |
2099 rows × 9 columns
# Lagging each country's mortality series turns the OWID timeseries into a
# supervised-learning table: every row carries the previous day's, week's and
# month's mortality rate as features, so XGBoost can be applied directly to
# rank predictors of COVID-19 mortality per country.
by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # shift() leaves NaN where a country has no history yet; treat that
    # pre-history as 0 mortality, as the original analysis did.
    df_updated[lag_col] = by_country.shift(lag).fillna(0)
# Principal Component Analysis to address multi-collinearity among predictors.
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' itself
# (and its lags), so the prediction target leaks into the fitted components —
# confirm this is intended before trusting the near-perfect R^2 downstream.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per raw input variable of the
# country-health-index analysis (hospital beds, HDI, extreme poverty,
# GDP per capita, population density, population).
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# BUG FIX: the components were previously labelled with the raw feature names,
# which falsely implied each PC *is* that variable; every PC is a linear mix
# of all inputs, so label them PC1..PC6 and let the downstream importance
# table report honestly.
selected_cols = ['PC{}'.format(i) for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs are the principal components; target is the mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit standardization statistics on the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using the statistics fitted on the training set.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters are chosen by the search below.
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values to explore.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, on all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model,
                           param_grid=params,
                           cv=10,
                           n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989605271130995
# GridSearchCV(refit=True, the default) has already retrained the winning
# configuration on the full training set, so best_estimator_ is ready to use.
# BUG FIX: removed the redundant best_model.fit(...) that re-trained the same
# model on the same data a second time.
best_model = grid_search.best_estimator_
# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate: MSE, RMSE, R^2 score, and an entropy diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence of the two
# vectors renormalized as probability distributions — not a standard
# regression metric; confirm this diagnostic is really what is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002685282233557505 R2 Score: 0.999210541084082 RMSE: 0.051820 Entropy Value: 0.00034527877599041973
# Rank the model inputs by their XGBoost importance scores, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.655426 |
| 1 | human_development_index | 0.198181 |
| 0 | hospital_beds_per_thousand | 0.115957 |
| 2 | extreme_poverty | 0.020517 |
| 4 | population_density | 0.006514 |
| 3 | gdp_per_capita | 0.003405 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute user path — this only runs on the author's
# machine; parameterize the data directory before sharing the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under analysis.
country1 = 'Denmark'
country2 = 'Finland'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
# BUG FIX: .copy() detaches the filtered rows from the parent frame, so the
# lagged-column assignments performed later mutate this frame directly instead
# of a view (avoids pandas' SettingWithCopyWarning / silently lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 19.677 | 42.3 | 0.00000 |
| 5188 | Denmark | 2/3/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 19.677 | 42.3 | 0.00000 |
| 5189 | Denmark | 2/4/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 19.677 | 42.3 | 0.00000 |
| 5190 | Denmark | 2/5/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 19.677 | 42.3 | 0.00000 |
| 5191 | Denmark | 2/6/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 19.677 | 42.3 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 21.228 | 42.8 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 21.228 | 42.8 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 21.228 | 42.8 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 21.228 | 42.8 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 21.228 | 42.8 | 0.55159 |
2128 rows × 9 columns
# Lagging each country's mortality series turns the OWID timeseries into a
# supervised-learning table: every row carries the previous day's, week's and
# month's mortality rate as features, so XGBoost can be applied directly to
# rank predictors of COVID-19 mortality per country.
by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # shift() leaves NaN where a country has no history yet; treat that
    # pre-history as 0 mortality, as the original analysis did.
    df_updated[lag_col] = by_country.shift(lag).fillna(0)
# Principal Component Analysis to address multi-collinearity among predictors.
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' itself
# (and its lags), so the prediction target leaks into the fitted components —
# confirm this is intended before trusting the near-perfect R^2 downstream.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per raw input variable of the
# population-health-index analysis (cardiovascular death rate, diabetes
# prevalence, female/male smokers, aged 65+, median age).
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# BUG FIX: the components were previously labelled with the raw feature names,
# which falsely implied each PC *is* that variable; every PC is a linear mix
# of all inputs, so label them PC1..PC6 and let the downstream importance
# table report honestly.
selected_cols = ['PC{}'.format(i) for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs are the principal components; target is the mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit standardization statistics on the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using the statistics fitted on the training set.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters are chosen by the search below.
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values to explore.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, on all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model,
                           param_grid=params,
                           cv=10,
                           n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.998779521821126
# GridSearchCV(refit=True, the default) has already retrained the winning
# configuration on the full training set, so best_estimator_ is ready to use.
# BUG FIX: removed the redundant best_model.fit(...) that re-trained the same
# model on the same data a second time.
best_model = grid_search.best_estimator_
# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate: MSE, RMSE, R^2 score, and an entropy diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence of the two
# vectors renormalized as probability distributions — not a standard
# regression metric; confirm this diagnostic is really what is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008955815256083285 R2 Score: 0.9949195425294739 RMSE: 0.094635 Entropy Value: 0.0018700174545439696
# Rank the model inputs by their XGBoost importance scores, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.825645 |
| 0 | cardiovasc_death_rate | 0.069251 |
| 2 | female_smokers | 0.051373 |
| 5 | median_age | 0.031775 |
| 3 | male_smokers | 0.020486 |
| 4 | aged_65_older | 0.001471 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute user path — this only runs on the author's
# machine; parameterize the data directory before sharing the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Denmark'
country2 = 'Finland'
# Restrict the frame to the two countries under comparison and to the
# socio-economic ("country health index") predictors plus the target.
feature_subset = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_subset]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5188 | Denmark | 2/3/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5189 | Denmark | 2/4/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5190 | Denmark | 2/5/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5191 | Denmark | 2/6/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
2128 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (1 day, 1 week, 1 month ago) and
# zero-fill the leading rows of each country that have no history yet.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] covers every column after 'location'/'date', which here
# includes 'Mortality Rate' itself and the three lagged-mortality columns — the
# prediction target leaks into the fitted components. Consider fitting PCA on the
# predictor columns only. Also, the data is unscaled at this point, so PCA variance
# is dominated by the largest-magnitude raw column (e.g. population).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are the ORIGINAL feature names, but the columns
# hold principal components (linear mixtures of all inputs, including the target and
# its lags). Downstream "feature importances" therefore describe PCs, not the named
# features — rename to PC1..PC6 (or skip PCA) to keep the attribution meaningful.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used — X is built from
# principal_df below and y only reads 'Mortality Rate' — so this step is dead work.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series lets the model train on the future;
# combined with the target leakage in the PCA inputs this largely explains the
# near-perfect R^2 — consider a chronological split and leakage-free features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardizer on the training split only, so test-set statistics
# never inform the scaling.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the training-set statistics.
X_train_scaled, X_test_scaled = (scaler.transform(part)
                                 for part in (X_train, X_test))
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# NOTE(review): 3*3*3*3*2*2 = 324 candidates x 10 folds = 3240 fits per section;
# RandomizedSearchCV would cover this space far more cheaply.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validated R^2.
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_}")
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9984953654596765
# Refit the grid-search winner on the full training split (fit returns the
# estimator itself), then score the held-out test split.
best_model = grid_search.best_estimator_.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors into probability
# distributions and returns their KL divergence — it is not a standard regression
# error metric, and it is undefined (inf) wherever y_pred is 0 but y_test is not.
# Confirm this is really the intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008136863131264145 R2 Score: 0.9953841179278682 RMSE: 0.090205 Entropy Value: 0.0015398143907589235
# Rank the model inputs by their learned importance share, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.753936 |
| 5 | population | 0.092143 |
| 0 | hospital_beds_per_thousand | 0.070220 |
| 2 | extreme_poverty | 0.061815 |
| 3 | gdp_per_capita | 0.020882 |
| 4 | population_density | 0.001005 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute, user-specific Windows path — this notebook only runs on
# one machine. Consider a relative path or configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'France'
country2 = 'Iceland'
# Restrict the frame to the two countries under comparison and to the
# demographic/health ("population health index") predictors plus the target.
feature_subset = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_subset]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 19.718 | 42.0 | 0.00000 |
| 8377 | France | 1/25/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 19.718 | 42.0 | 0.00000 |
| 8378 | France | 1/26/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 19.718 | 42.0 | 0.00000 |
| 8379 | France | 1/27/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 19.718 | 42.0 | 0.00000 |
| 8380 | France | 1/28/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 19.718 | 42.0 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 14.431 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 14.431 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 14.431 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 14.431 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 14.431 | 37.3 | 0.11011 |
2107 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (1 day, 1 week, 1 month ago) and
# zero-fill the leading rows of each country that have no history yet.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] covers every column after 'location'/'date', which here
# includes 'Mortality Rate' itself and the three lagged-mortality columns — the
# prediction target leaks into the fitted components. Consider fitting PCA on the
# predictor columns only. Also, the data is unscaled at this point, so PCA variance
# is dominated by the largest-magnitude raw column.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are the ORIGINAL feature names, but the columns
# hold principal components (linear mixtures of all inputs, including the target and
# its lags). Downstream "feature importances" therefore describe PCs, not the named
# features — rename to PC1..PC6 (or skip PCA) to keep the attribution meaningful.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used — X is built from
# principal_df below and y only reads 'Mortality Rate' — so this step is dead work.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series lets the model train on the future;
# combined with the target leakage in the PCA inputs this largely explains the
# near-perfect R^2 — consider a chronological split and leakage-free features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardizer on the training split only, so test-set statistics
# never inform the scaling.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the training-set statistics.
X_train_scaled, X_test_scaled = (scaler.transform(part)
                                 for part in (X_train, X_test))
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# NOTE(review): 3*3*3*3*2*2 = 324 candidates x 10 folds = 3240 fits per section;
# RandomizedSearchCV would cover this space far more cheaply.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validated R^2.
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_}")
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9960173520783563
# Refit the grid-search winner on the full training split (fit returns the
# estimator itself), then score the held-out test split.
best_model = grid_search.best_estimator_.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors into probability
# distributions and returns their KL divergence — it is not a standard regression
# error metric, and it is undefined (inf) wherever y_pred is 0 but y_test is not.
# Confirm this is really the intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.042620757311671066 R2 Score: 0.9966108339202479 RMSE: 0.206448 Entropy Value: 0.0015090598930850515
# Rank the model inputs by their learned importance share, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.764287 |
| 0 | cardiovasc_death_rate | 0.178746 |
| 5 | median_age | 0.031670 |
| 3 | male_smokers | 0.009990 |
| 2 | female_smokers | 0.008057 |
| 4 | aged_65_older | 0.007250 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute, user-specific Windows path — this notebook only runs on
# one machine. Consider a relative path or configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'France'
country2 = 'Iceland'
# Restrict the frame to the two countries under comparison and to the
# socio-economic ("country health index") predictors plus the target.
feature_subset = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_subset]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8377 | France | 1/25/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8378 | France | 1/26/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8379 | France | 1/27/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8380 | France | 1/28/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
2107 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (1 day, 1 week, 1 month ago) and
# zero-fill the leading rows of each country that have no history yet.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] covers every column after 'location'/'date', which here
# includes 'Mortality Rate' itself and the three lagged-mortality columns — the
# prediction target leaks into the fitted components. Consider fitting PCA on the
# predictor columns only. Also, the data is unscaled at this point, so PCA variance
# is dominated by the largest-magnitude raw column (e.g. population).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are the ORIGINAL feature names, but the columns
# hold principal components (linear mixtures of all inputs, including the target and
# its lags). Downstream "feature importances" therefore describe PCs, not the named
# features — rename to PC1..PC6 (or skip PCA) to keep the attribution meaningful.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used — X is built from
# principal_df below and y only reads 'Mortality Rate' — so this step is dead work.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series lets the model train on the future;
# combined with the target leakage in the PCA inputs this largely explains the
# near-perfect R^2 — consider a chronological split and leakage-free features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardizer on the training split only, so test-set statistics
# never inform the scaling.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the training-set statistics.
X_train_scaled, X_test_scaled = (scaler.transform(part)
                                 for part in (X_train, X_test))
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# NOTE(review): 3*3*3*3*2*2 = 324 candidates x 10 folds = 3240 fits per section;
# RandomizedSearchCV would cover this space far more cheaply.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validated R^2.
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_}")
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9946715554617531
# Refit the grid-search winner on the full training split (fit returns the
# estimator itself), then score the held-out test split.
best_model = grid_search.best_estimator_.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors into probability
# distributions and returns their KL divergence — it is not a standard regression
# error metric, and it is undefined (inf) wherever y_pred is 0 but y_test is not.
# Confirm this is really the intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.06349754860870421 R2 Score: 0.9949507293753999 RMSE: 0.251987 Entropy Value: 0.002273841175109527
# Rank the model inputs by their learned importance share, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.739557 |
| 0 | hospital_beds_per_thousand | 0.133697 |
| 4 | population_density | 0.060550 |
| 5 | population | 0.031155 |
| 3 | gdp_per_capita | 0.017913 |
| 2 | extreme_poverty | 0.017128 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute, user-specific Windows path — this notebook only runs on
# one machine. Consider a relative path or configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Ireland'
country2 = 'Italy'
# Restrict the frame to the two countries under comparison and to the
# demographic/health ("population health index") predictors plus the target.
feature_subset = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_subset]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 13.928 | 38.7 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 13.928 | 38.7 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 13.928 | 38.7 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 13.928 | 38.7 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 13.928 | 38.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 23.021 | 47.9 | 0.735109 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (1 day, 1 week, 1 month ago) and
# zero-fill the leading rows of each country that have no history yet.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] covers every column after 'location'/'date', which here
# includes 'Mortality Rate' itself and the three lagged-mortality columns — the
# prediction target leaks into the fitted components. Consider fitting PCA on the
# predictor columns only. Also, the data is unscaled at this point, so PCA variance
# is dominated by the largest-magnitude raw column.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are the first six principal components (linear mixtures of
# all PCA inputs), not the original variables; labelling them with the raw feature names
# makes the later feature-importance table read as if it ranked the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used afterwards — X comes from
# principal_df and y from 'Mortality Rate' — so this encoding step is dead work here.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default, so rows of the same country's time
# series land in both train and test; a time-based split would be the honest evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale train and test features with the scaler fitted on the training split only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor to be tuned
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive search over the grid with 10-fold cross-validation, using all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.999154237841806
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training data by default
# (refit=True), so this extra fit call is redundant — harmless, but duplicated work.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes a KL divergence after normalising
# both arrays into probability distributions; mortality rates are not distributions
# (and contain zeros), so this value is not a meaningful regression metric — confirm
# it is actually wanted before reporting it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.006427766546869428 R2 Score: 0.999471944421523 RMSE: 0.080173 Entropy Value: 0.00028870844431895304
# Tabulate the tuned model's feature importances, largest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.844150 |
| 1 | diabetes_prevalence | 0.100753 |
| 0 | cardiovasc_death_rate | 0.043883 |
| 2 | female_smokers | 0.008767 |
| 3 | male_smokers | 0.002337 |
| 4 | aged_65_older | 0.000110 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — this only runs on one machine;
# prefer a relative path or a configurable constant defined once at the top.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Ireland'
country2 = 'Italy'
# Restrict the frame to the country-health-index features for the chosen pair of countries
cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the first 1/7/30 rows per country treats "no history yet"
# as a mortality rate of 0; early-pandemic rates are 0 anyway, but confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date', which includes
# 'Mortality Rate' (the target) and the three lag columns created above. Fitting PCA on
# the target leaks it into the components later used as X, which would inflate the
# near-perfect R^2 — consider excluding 'Mortality Rate' here (the matching transform in
# the next cell must change in the same way).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are the first six principal components (linear mixtures of
# all PCA inputs), not the original variables; labelling them with the raw feature names
# makes the later feature-importance table read as if it ranked the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used afterwards — X comes from
# principal_df and y from 'Mortality Rate' — so this encoding step is dead work here.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default, so rows of the same country's time
# series land in both train and test; a time-based split would be the honest evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale train and test features with the scaler fitted on the training split only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor to be tuned
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive search over the grid with 10-fold cross-validation, using all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9988984971748192
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training data by default
# (refit=True), so this extra fit call is redundant — harmless, but duplicated work.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes a KL divergence after normalising
# both arrays into probability distributions; mortality rates are not distributions
# (and contain zeros), so this value is not a meaningful regression metric — confirm
# it is actually wanted before reporting it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.009889496543689754 R2 Score: 0.9991875554626719 RMSE: 0.099446 Entropy Value: 0.00044270709478587373
# Tabulate the tuned model's feature importances, largest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.702587 |
| 0 | hospital_beds_per_thousand | 0.200820 |
| 5 | population | 0.042593 |
| 2 | extreme_poverty | 0.026734 |
| 3 | gdp_per_capita | 0.018469 |
| 4 | population_density | 0.008797 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — this only runs on one machine;
# prefer a relative path or a configurable constant defined once at the top.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Luxembourg'
country2 = 'Netherlands'
# Restrict the frame to the population-health-index features for the chosen pair of countries
cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 14.312 | 39.7 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 14.312 | 39.7 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 14.312 | 39.7 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 14.312 | 39.7 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 14.312 | 39.7 | 0.377872 |
2078 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the first 1/7/30 rows per country treats "no history yet"
# as a mortality rate of 0; early-pandemic rates are 0 anyway, but confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date', which includes
# 'Mortality Rate' (the target) and the three lag columns created above. Fitting PCA on
# the target leaks it into the components later used as X, which would inflate the
# near-perfect R^2 — consider excluding 'Mortality Rate' here (the matching transform in
# the next cell must change in the same way).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are the first six principal components (linear mixtures of
# all PCA inputs), not the original variables; labelling them with the raw feature names
# makes the later feature-importance table read as if it ranked the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used afterwards — X comes from
# principal_df and y from 'Mortality Rate' — so this encoding step is dead work here.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default, so rows of the same country's time
# series land in both train and test; a time-based split would be the honest evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale train and test features with the scaler fitted on the training split only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor to be tuned
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive search over the grid with 10-fold cross-validation, using all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989910619474511
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training data by default
# (refit=True), so this extra fit call is redundant — harmless, but duplicated work.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes a KL divergence after normalising
# both arrays into probability distributions; mortality rates are not distributions
# (and contain zeros), so this value is not a meaningful regression metric — confirm
# it is actually wanted before reporting it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.007086354331332736 R2 Score: 0.9990764156470773 RMSE: 0.084180 Entropy Value: 0.000702998195189773
# Tabulate the tuned model's feature importances, largest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.685790 |
| 0 | cardiovasc_death_rate | 0.259016 |
| 5 | median_age | 0.038685 |
| 2 | female_smokers | 0.013791 |
| 3 | male_smokers | 0.002615 |
| 4 | aged_65_older | 0.000103 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — this only runs on one machine;
# prefer a relative path or a configurable constant defined once at the top.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Luxembourg'
country2 = 'Netherlands'
# Restrict the frame to the country-health-index features for the chosen pair of countries
cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
2078 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the first 1/7/30 rows per country treats "no history yet"
# as a mortality rate of 0; early-pandemic rates are 0 anyway, but confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date', which includes
# 'Mortality Rate' (the target) and the three lag columns created above. Fitting PCA on
# the target leaks it into the components later used as X, which would inflate the
# near-perfect R^2 — consider excluding 'Mortality Rate' here (the matching transform in
# the next cell must change in the same way).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are the first six principal components (linear mixtures of
# all PCA inputs), not the original variables; labelling them with the raw feature names
# makes the later feature-importance table read as if it ranked the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used afterwards — X comes from
# principal_df and y from 'Mortality Rate' — so this encoding step is dead work here.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default, so rows of the same country's time
# series land in both train and test; a time-based split would be the honest evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale train and test features with the scaler fitted on the training split only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor to be tuned
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive search over the grid with 10-fold cross-validation, using all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987243093178199
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training data by default
# (refit=True), so this extra fit call is redundant — harmless, but duplicated work.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes a KL divergence after normalising
# both arrays into probability distributions; mortality rates are not distributions
# (and contain zeros), so this value is not a meaningful regression metric — confirm
# it is actually wanted before reporting it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.007716113263742701 R2 Score: 0.9989943374064345 RMSE: 0.087841 Entropy Value: 0.0017161933932385992
# Tabulate the tuned model's feature importances, largest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.482469 |
| 2 | extreme_poverty | 0.244199 |
| 0 | hospital_beds_per_thousand | 0.199908 |
| 5 | population | 0.057787 |
| 3 | gdp_per_capita | 0.014620 |
| 4 | population_density | 0.001017 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — this only runs on one machine;
# prefer a relative path or a configurable constant defined once at the top.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Portugal'
country2 = 'Slovenia'
# Keep only the population-health-index features used by the XGBoost analysis
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict to the two countries being compared; .copy() makes the subset an
# independent frame so the lagged columns added to it later do not trigger
# SettingWithCopyWarning / silent chained-assignment behaviour.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 21.502 | 46.2 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 21.502 | 46.2 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 21.502 | 46.2 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 21.502 | 46.2 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 21.502 | 46.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 19.062 | 44.5 | 0.536669 |
2096 rows × 9 columns
'''
Create lagged copies of the mortality rate (previous day / week / month) with
pandas shift() so the time series becomes a supervised-learning table: each
row is one observation and each column a feature, which is the tabular format
XGBoost needs to assess which variables best predict COVID-19 mortality per
country.
'''
# Lagged mortality features, computed within each country so values never
# bleed across country boundaries
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
# The first 1/7/30 rows of each country have no history; treat them as 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Principal Component Analysis to address multi-collinearity among the inputs.
# NOTE(review): PCA is fitted on unscaled columns (large-scale columns dominate
# the variance), on the full dataset before the train/test split, and the
# iloc[:, 2:] slice includes 'Mortality Rate' (the prediction target) and its
# lags -- confirm this is intended, as it leaks the target into the components.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components -- matching the number of input
# variables for the XGBoost model analysis of the population health index
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): the iloc[:, 2:] slice fed to PCA includes 'Mortality Rate'
# (the prediction target) and its lagged copies, so the components below leak
# the target into X -- likely the cause of the near-perfect R^2; confirm intent.
# Each component is a linear mix of ALL those columns, so label them PC1..PC6
# instead of reusing the original feature names, which would misattribute the
# feature importances reported later.
selected_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column (note: the resulting dummy columns are not
# used as model inputs below)
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split into training and testing sets for the XGBoost model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only (no test-set leakage at this step)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Transform both splits with the scaler fitted on the training data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; all tuning happens through the grid below
xgb_model = xgb.XGBRegressor()
# Search space for the hyperparameter tuning (324 combinations)
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelized across
# all available CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9985698342591484
# Refit the grid-search winner (best hyperparameters) on the training set
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict mortality rates for the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two normalized distributions, not a regression error metric, and it returns
# inf whenever y_pred contains a zero where y_test does not. Confirm whether a
# proper regression metric (e.g. MAE) was intended here.
# NOTE(review): the near-perfect R^2 is consistent with target leakage upstream
# (the PCA inputs slice includes 'Mortality Rate' and its lags) -- verify.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002959592405574864 R2 Score: 0.9985184915498152 RMSE: 0.054402 Entropy Value: 0.00030518164442021166
# Rank the model inputs by their XGBoost importance scores, highest first.
# Note: the inputs are PCA components, so each label reflects whatever name
# was assigned to that component upstream.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.865331 |
| 0 | cardiovasc_death_rate | 0.085650 |
| 3 | male_smokers | 0.024057 |
| 2 | female_smokers | 0.020765 |
| 5 | median_age | 0.003633 |
| 4 | aged_65_older | 0.000564 |
# Load the combined OWID COVID-19 dataframe covering all 26 countries.
# NOTE(review): hard-coded absolute Windows path -- consider a relative path or
# a configurable constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Portugal'
country2 = 'Slovenia'
# Keep only the country-health-index features used by the XGBoost analysis
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the two countries being compared; .copy() makes the subset an
# independent frame so the lagged columns added to it later do not trigger
# SettingWithCopyWarning / silent chained-assignment behaviour.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.536669 |
2096 rows × 9 columns
'''
Create lagged copies of the mortality rate (previous day / week / month) with
pandas shift() so the time series becomes a supervised-learning table: each
row is one observation and each column a feature, which is the tabular format
XGBoost needs to assess which variables best predict COVID-19 mortality per
country.
'''
# Lagged mortality features, computed within each country so values never
# bleed across country boundaries
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
# The first 1/7/30 rows of each country have no history; treat them as 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Principal Component Analysis to address multi-collinearity among the inputs.
# NOTE(review): PCA is fitted on unscaled columns (large-scale columns dominate
# the variance), on the full dataset before the train/test split, and the
# iloc[:, 2:] slice includes 'Mortality Rate' (the prediction target) and its
# lags -- confirm this is intended, as it leaks the target into the components.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components -- matching the number of input
# variables for the XGBoost model analysis of the country health index
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): the iloc[:, 2:] slice fed to PCA includes 'Mortality Rate'
# (the prediction target) and its lagged copies, so the components below leak
# the target into X -- likely the cause of the near-perfect R^2; confirm intent.
# Each component is a linear mix of ALL those columns, so label them PC1..PC6
# instead of reusing the original feature names, which would misattribute the
# feature importances reported later.
selected_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column (note: the resulting dummy columns are not
# used as model inputs below)
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split into training and testing sets for the XGBoost model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only (no test-set leakage at this step)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Transform both splits with the scaler fitted on the training data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; all tuning happens through the grid below
xgb_model = xgb.XGBRegressor()
# Search space for the hyperparameter tuning (324 combinations)
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelized across
# all available CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9987931112255966
# Refit the grid-search winner (best hyperparameters) on the training set
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict mortality rates for the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two normalized distributions, not a regression error metric, and it returns
# inf whenever y_pred contains a zero where y_test does not. Confirm whether a
# proper regression metric (e.g. MAE) was intended here.
# NOTE(review): the near-perfect R^2 is consistent with target leakage upstream
# (the PCA inputs slice includes 'Mortality Rate' and its lags) -- verify.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002719376915070921 R2 Score: 0.9986387382697272 RMSE: 0.052148 Entropy Value: 0.00030312420047472904
# Rank the model inputs by their XGBoost importance scores, highest first.
# Note: the inputs are PCA components, so each label reflects whatever name
# was assigned to that component upstream.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.727041 |
| 5 | population | 0.146595 |
| 0 | hospital_beds_per_thousand | 0.068287 |
| 2 | extreme_poverty | 0.033873 |
| 3 | gdp_per_capita | 0.023341 |
| 4 | population_density | 0.000863 |
# Load the combined OWID COVID-19 dataframe covering all 26 countries.
# NOTE(review): hard-coded absolute Windows path -- consider a relative path or
# a configurable constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Spain'
country2 = 'Sweden'
# Keep only the population-health-index features used by the XGBoost analysis
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict to the two countries being compared; .copy() makes the subset an
# independent frame so the lagged columns added to it later do not trigger
# SettingWithCopyWarning / silent chained-assignment behaviour.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 23011 | Sweden | 2/1/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 19.985 | 41.0 | 0.000000 |
| 23012 | Sweden | 2/2/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 19.985 | 41.0 | 0.000000 |
| 23013 | Sweden | 2/3/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 19.985 | 41.0 | 0.000000 |
| 23014 | Sweden | 2/4/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 19.985 | 41.0 | 0.000000 |
| 23015 | Sweden | 2/5/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 19.985 | 41.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 19.436 | 45.5 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 19.436 | 45.5 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 19.436 | 45.5 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 19.436 | 45.5 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 19.436 | 45.5 | 0.855148 |
2126 rows × 9 columns
'''
Create lagged copies of the mortality rate (previous day / week / month) with
pandas shift() so the time series becomes a supervised-learning table: each
row is one observation and each column a feature, which is the tabular format
XGBoost needs to assess which variables best predict COVID-19 mortality per
country.
'''
# Lagged mortality features, computed within each country so values never
# bleed across country boundaries
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
# The first 1/7/30 rows of each country have no history; treat them as 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Principal Component Analysis to address multi-collinearity among the inputs.
# NOTE(review): PCA is fitted on unscaled columns (large-scale columns dominate
# the variance), on the full dataset before the train/test split, and the
# iloc[:, 2:] slice includes 'Mortality Rate' (the prediction target) and its
# lags -- confirm this is intended, as it leaks the target into the components.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components -- matching the number of input
# variables for the XGBoost model analysis of the population health index
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): the iloc[:, 2:] slice fed to PCA includes 'Mortality Rate'
# (the prediction target) and its lagged copies, so the components below leak
# the target into X -- likely the cause of the near-perfect R^2; confirm intent.
# Each component is a linear mix of ALL those columns, so label them PC1..PC6
# instead of reusing the original feature names, which would misattribute the
# feature importances reported later.
selected_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column (note: the resulting dummy columns are not
# used as model inputs below)
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split into training and testing sets for the XGBoost model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only (no test-set leakage at this step)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Transform both splits with the scaler fitted on the training data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; all tuning happens through the grid below
xgb_model = xgb.XGBRegressor()
# Search space for the hyperparameter tuning (324 combinations)
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelized across
# all available CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.998867876669354
# Refit the grid-search winner (best hyperparameters) on the training set
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict mortality rates for the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two normalized distributions, not a regression error metric, and it returns
# inf whenever y_pred contains a zero where y_test does not -- the 'inf'
# printed below is this failure mode, not a meaningful result. Confirm whether
# a proper regression metric (e.g. MAE) was intended here.
# NOTE(review): the near-perfect R^2 is consistent with target leakage upstream
# (the PCA inputs slice includes 'Mortality Rate' and its lags) -- verify.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.009929112730334729 R2 Score: 0.9988353451448788 RMSE: 0.099645 Entropy Value: inf
# Rank the model inputs by their XGBoost importance scores, highest first.
# Note: the inputs are PCA components, so each label reflects whatever name
# was assigned to that component upstream.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.943762 |
| 2 | female_smokers | 0.031796 |
| 0 | cardiovasc_death_rate | 0.012842 |
| 3 | male_smokers | 0.006060 |
| 5 | median_age | 0.005185 |
| 4 | aged_65_older | 0.000355 |
# Load the combined OWID COVID-19 dataframe covering all 26 countries.
# NOTE(review): hard-coded absolute Windows path -- consider a relative path or
# a configurable constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Spain'
country2 = 'Sweden'
# Keep only the country-health-index features used by the XGBoost analysis
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the two countries being compared; .copy() makes the subset an
# independent frame so the lagged columns added to it later do not trigger
# SettingWithCopyWarning / silent chained-assignment behaviour.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 23011 | Sweden | 2/1/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| 23012 | Sweden | 2/2/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| 23013 | Sweden | 2/3/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| 23014 | Sweden | 2/4/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| 23015 | Sweden | 2/5/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
2126 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive per-country lagged mortality-rate features (1 day, 7 days, 30 days)
# so the time series can be framed as a supervised-learning problem for XGBoost.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # Rows with no earlier observation come back NaN from shift(); fill with 0.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to reduce multi-collinearity among predictors.
# FIX(review): the original fit PCA on df_updated.iloc[:, 2:], which contains the
# 'Mortality Rate' target column itself -- leaking the target into the model
# inputs and inflating downstream model scores. The target is excluded here.
pca_input = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep the first 6 principal components (the number of input variables used in
# the XGBoost Model Analysis for the country health index).
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): these six columns are principal components (linear combinations
# of all inputs), not the original variables; the original feature names are
# retained only so downstream indexing by these names keeps working.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies().
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of observations for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: the scaler is fit on the training split only, then
# applied to both splits (fit_transform == fit followed by transform).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor and its hyperparameter search space.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation on all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987460071894578
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) treats its arguments as
# (unnormalized) probability distributions and returns the KL divergence;
# applying it to raw mortality values/predictions is statistically dubious,
# and any zero in y_pred would make it infinite -- confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.014248650470063625 R2 Score: 0.9983286764487842 RMSE: 0.119368 Entropy Value: 0.0007314147275536079
feature_importances = best_model.feature_importances_
# Rank the six model inputs by XGBoost importance score, highest first.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.928746 |
| 2 | extreme_poverty | 0.039890 |
| 5 | population | 0.023575 |
| 3 | gdp_per_capita | 0.006665 |
| 0 | hospital_beds_per_thousand | 0.000593 |
| 4 | population_density | 0.000530 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- runs only on the author's
# machine; consider a relative path or configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the population-health-index analysis.
country1 = 'Switzerland'
country2 = 'United Kingdom'
# Keep the identifier columns plus the population-health predictors and the
# 'Mortality Rate' target, then restrict the rows to the two countries.
population_health_cols = [
    'location', 'date',
    'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'male_smokers',
    'aged_65_older', 'median_age',
    'Mortality Rate',
]
df_updated = df_updated[population_health_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 18.517 | 40.8 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 18.517 | 40.8 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 18.517 | 40.8 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 18.517 | 40.8 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 18.517 | 40.8 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 18.436 | 43.1 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 18.436 | 43.1 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 18.436 | 43.1 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 18.436 | 43.1 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 18.436 | 43.1 | 0.322149 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive per-country lagged mortality-rate features (1 day, 7 days, 30 days)
# so the time series can be framed as a supervised-learning problem for XGBoost.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # Rows with no earlier observation come back NaN from shift(); fill with 0.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to reduce multi-collinearity among predictors.
# FIX(review): the original fit PCA on df_updated.iloc[:, 2:], which here is the
# 6 population-health predictors + 'Mortality Rate' + the 3 lag columns -- i.e.
# the target itself leaks into the model inputs, inflating downstream scores.
# The target column is excluded here.
pca_input = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep the first 6 principal components (the number of input variables used in
# the XGBoost Model Analysis for the population health index).
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): these six columns are principal components (linear combinations
# of all inputs), not the original variables; the original feature names are
# retained only so downstream indexing by these names keeps working.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies().
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of observations for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: the scaler is fit on the training split only, then
# applied to both splits (fit_transform == fit followed by transform).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor and its hyperparameter search space.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation on all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9551717291204419
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) treats its arguments as
# (unnormalized) probability distributions and returns the KL divergence;
# applying it to raw mortality values/predictions is statistically dubious,
# and any zero in y_pred would make it infinite -- confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.14991967159078062 R2 Score: 0.9940801441598267 RMSE: 0.387195 Entropy Value: 0.0024482743045327773
feature_importances = best_model.feature_importances_
# Rank the six model inputs by XGBoost importance score, highest first.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.481112 |
| 5 | median_age | 0.306261 |
| 1 | diabetes_prevalence | 0.069334 |
| 2 | female_smokers | 0.062658 |
| 4 | aged_65_older | 0.045872 |
| 3 | male_smokers | 0.034763 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- runs only on the author's
# machine; consider a relative path or configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the country-health-index analysis.
country1 = 'Switzerland'
country2 = 'United Kingdom'
# Keep the identifier columns plus the country-health predictors and the
# 'Mortality Rate' target, then restrict the rows to the two countries.
country_health_cols = [
    'location', 'date',
    'hospital_beds_per_thousand', 'human_development_index',
    'extreme_poverty', 'gdp_per_capita',
    'population_density', 'population',
    'Mortality Rate',
]
df_updated = df_updated[country_health_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322149 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive per-country lagged mortality-rate features (1 day, 7 days, 30 days)
# so the time series can be framed as a supervised-learning problem for XGBoost.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # Rows with no earlier observation come back NaN from shift(); fill with 0.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to reduce multi-collinearity among predictors.
# FIX(review): the original fit PCA on df_updated.iloc[:, 2:], which here is the
# 6 country-health predictors + 'Mortality Rate' + the 3 lag columns -- i.e. the
# target itself leaks into the model inputs, inflating downstream scores.
# The target column is excluded here.
pca_input = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep the first 6 principal components (the number of input variables used in
# the XGBoost Model Analysis for the country health index).
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): these six columns are principal components (linear combinations
# of all inputs), not the original variables; the original feature names are
# retained only so downstream indexing by these names keeps working.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies().
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of observations for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: the scaler is fit on the training split only, then
# applied to both splits (fit_transform == fit followed by transform).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor and its hyperparameter search space.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation on all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.952952674424612
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) treats its arguments as
# (unnormalized) probability distributions and returns the KL divergence;
# applying it to raw mortality values/predictions is statistically dubious,
# and any zero in y_pred would make it infinite -- confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.8303105819555134 R2 Score: 0.9672136491789828 RMSE: 0.911214 Entropy Value: 0.006240288209575662
feature_importances = best_model.feature_importances_
# Rank the six model inputs by XGBoost importance score, highest first.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.314746 |
| 1 | human_development_index | 0.291846 |
| 2 | extreme_poverty | 0.138111 |
| 0 | hospital_beds_per_thousand | 0.116382 |
| 4 | population_density | 0.079420 |
| 3 | gdp_per_capita | 0.059494 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- runs only on the author's
# machine; consider a relative path or configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the population-health-index analysis.
country1 = 'Czechia'
country2 = 'Estonia'
# Keep the identifier columns plus the population-health predictors and the
# 'Mortality Rate' target, then restrict the rows to the two countries.
population_health_cols = [
    'location', 'date',
    'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'male_smokers',
    'aged_65_older', 'median_age',
    'Mortality Rate',
]
df_updated = df_updated[population_health_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 19.027 | 43.3 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 19.027 | 43.3 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 19.027 | 43.3 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 19.027 | 43.3 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 19.027 | 43.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 19.452 | 42.7 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 19.452 | 42.7 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 19.452 | 42.7 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 19.452 | 42.7 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 19.452 | 42.7 | 0.466423 |
2095 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive per-country lagged mortality-rate features (1 day, 7 days, 30 days)
# so the time series can be framed as a supervised-learning problem for XGBoost.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # Rows with no earlier observation come back NaN from shift(); fill with 0.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to reduce multi-collinearity among predictors.
# FIX(review): the original fit PCA on df_updated.iloc[:, 2:], which here is the
# 6 population-health predictors + 'Mortality Rate' + the 3 lag columns -- i.e.
# the target itself leaks into the model inputs, inflating downstream scores.
# The target column is excluded here.
pca_input = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep the first 6 principal components (the number of input variables used in
# the XGBoost Model Analysis for the population health index).
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): these six columns are principal components (linear combinations
# of all inputs), not the original variables; the original feature names are
# retained only so downstream indexing by these names keeps working.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies().
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of observations for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: the scaler is fit on the training split only, then
# applied to both splits (fit_transform == fit followed by transform).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor and its hyperparameter search space.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation on all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9988914645141207
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) treats its arguments as
# (unnormalized) probability distributions and returns the KL divergence;
# applying it to raw mortality values/predictions is statistically dubious,
# and any zero in y_pred would make it infinite -- confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0009737929752882011 R2 Score: 0.998586037150187 RMSE: 0.031206 Entropy Value: 0.00030888828385945916
feature_importances = best_model.feature_importances_
# Rank the six model inputs by XGBoost importance score, highest first.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.829546 |
| 5 | median_age | 0.073210 |
| 0 | cardiovasc_death_rate | 0.065060 |
| 2 | female_smokers | 0.021690 |
| 3 | male_smokers | 0.010329 |
| 4 | aged_65_older | 0.000165 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- runs only on the author's
# machine; consider a relative path or configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this XGBoost run
country1 = 'Czechia'
country2 = 'Estonia'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the
# lagged-mortality columns added later are plain assignments rather than
# writes into a view (avoids SettingWithCopyWarning / silently lost updates)
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
2095 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each shift within a single country, so one
# country's first rows never borrow the other country's mortality history
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows encodes "no history yet" as a
# 0 mortality rate instead of dropping those rows — confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' and its lagged copies — the prediction target
# leaks into the components later used as model inputs; verify this is wanted.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold the first six principal components, not the
# original variables — reusing the raw feature names means the later
# "feature importance" table actually ranks PC1..PC6 under misleading labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummy columns are created but never fed to the
# model below (X comes from principal_df only) — confirm the encoding is needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# 70/30 split; fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training set only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model; fix the seed so the tuned result is reproducible,
# consistent with the fixed random_state used for the train/test split
xgb_model = xgb.XGBRegressor(random_state=42)
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10), using all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the hyperparameter combination and CV score chosen by the grid search
best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
print(f"Best hyperparameters: {best_params}")
print(f"Best CV score: {best_cv_score}")
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9982992559865769
# Refit the grid-search winner on the full (scaled) training set
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes both vectors and returns inf
# whenever a prediction is 0 where the true value is not — this is exactly
# what produced the "Entropy Value: inf" output for this run; shift both by
# a tiny epsilon so the metric stays finite.
# NOTE(review): mortality rates are not probability distributions — confirm
# KL divergence is really the intended "entropy" metric here.
eps = 1e-12
entropy_val = entropy(y_test + eps, y_pred + eps)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0013297376906791 R2 Score: 0.9980691997762051 RMSE: 0.036466 Entropy Value: inf
# Rank the model inputs by their XGBoost importance scores, highest first
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.691546 |
| 0 | hospital_beds_per_thousand | 0.132334 |
| 5 | population | 0.127958 |
| 2 | extreme_poverty | 0.030192 |
| 3 | gdp_per_capita | 0.017370 |
| 4 | population_density | 0.000600 |
# Load the combined dataset covering all 26 countries
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this XGBoost run
country1 = 'United States'
country2 = 'Bulgaria'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the
# lagged-mortality columns added later are plain assignments rather than
# writes into a view (avoids SettingWithCopyWarning / silently lost updates)
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 15.413 | 38.3 | 1.084791 |
2100 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each shift within a single country, so one
# country's first rows never borrow the other country's mortality history
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows encodes "no history yet" as a
# 0 mortality rate instead of dropping those rows — confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' and its lagged copies — the prediction target
# leaks into the components later used as model inputs; verify this is wanted.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold the first six principal components, not the
# original variables — reusing the raw feature names means the later
# "feature importance" table actually ranks PC1..PC6 under misleading labels.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummy columns are created but never fed to the
# model below (X comes from principal_df only) — confirm the encoding is needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# 70/30 split; fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training set only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model; fix the seed so the tuned result is reproducible,
# consistent with the fixed random_state used for the train/test split
xgb_model = xgb.XGBRegressor(random_state=42)
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10), using all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the hyperparameter combination and CV score chosen by the grid search
best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
print(f"Best hyperparameters: {best_params}")
print(f"Best CV score: {best_cv_score}")
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9632592414289769
# Refit the grid-search winner on the full (scaled) training set
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes both vectors and returns inf
# whenever a prediction is 0 where the true value is not; shift both by a
# tiny epsilon so the metric stays finite on zero-valued mortality rows.
# NOTE(review): mortality rates are not probability distributions — confirm
# KL divergence is really the intended "entropy" metric here.
eps = 1e-12
entropy_val = entropy(y_test + eps, y_pred + eps)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.007520814879905908 R2 Score: 0.9962170632858504 RMSE: 0.086723 Entropy Value: 0.0005526318445337031
# Rank the model inputs by their XGBoost importance scores, highest first
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.332596 |
| 1 | diabetes_prevalence | 0.290038 |
| 5 | median_age | 0.260804 |
| 2 | female_smokers | 0.059976 |
| 4 | aged_65_older | 0.029853 |
| 3 | male_smokers | 0.026733 |
# Load the combined dataset covering all 26 countries
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this XGBoost run
country1 = 'United States'
country2 = 'Bulgaria'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the
# lagged-mortality columns added later are plain assignments rather than
# writes into a view (avoids SettingWithCopyWarning / silently lost updates)
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.770 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.770 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.770 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.770 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.770 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2100 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each shift within a single country, so one
# country's first rows never borrow the other country's mortality history
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows encodes "no history yet" as a
# 0 mortality rate instead of dropping those rows — confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' and its lagged copies — the prediction target
# leaks into the components later used as model inputs; verify this is wanted.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold the first six principal components, not the
# original variables — reusing the raw feature names means the later
# "feature importance" table actually ranks PC1..PC6 under misleading labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummy columns are created but never fed to the
# model below (X comes from principal_df only) — confirm the encoding is needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# 70/30 split; fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training set only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model; fix the seed so the tuned result is reproducible,
# consistent with the fixed random_state used for the train/test split
xgb_model = xgb.XGBRegressor(random_state=42)
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10), using all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the hyperparameter combination and CV score chosen by the grid search
best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
print(f"Best hyperparameters: {best_params}")
print(f"Best CV score: {best_cv_score}")
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9557094751056194
# Refit the grid-search winner on the full (scaled) training set
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes both vectors and returns inf
# whenever a prediction is 0 where the true value is not; shift both by a
# tiny epsilon so the metric stays finite on zero-valued mortality rows.
# NOTE(review): mortality rates are not probability distributions — confirm
# KL divergence is really the intended "entropy" metric here.
eps = 1e-12
entropy_val = entropy(y_test + eps, y_pred + eps)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.01529064233782365 R2 Score: 0.9923088743432271 RMSE: 0.123655 Entropy Value: 0.0009932015791245545
# Rank the model inputs by their XGBoost importance scores, highest first
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | hospital_beds_per_thousand | 0.355586 |
| 5 | population | 0.256295 |
| 1 | human_development_index | 0.226996 |
| 4 | population_density | 0.069507 |
| 2 | extreme_poverty | 0.056325 |
| 3 | gdp_per_capita | 0.035290 |
# Load the combined dataset covering all 26 countries
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this XGBoost run
country1 = 'Latvia'
country2 = 'Romania'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the
# lagged-mortality columns added later are plain assignments rather than
# writes into a view (avoids SettingWithCopyWarning / silently lost updates)
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 17800 | Romania | 2/26/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 17.850 | 43.0 | 0.000000 |
| 17801 | Romania | 2/27/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 17.850 | 43.0 | 0.000000 |
| 17802 | Romania | 2/28/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 17.850 | 43.0 | 0.000000 |
| 17803 | Romania | 2/29/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 17.850 | 43.0 | 0.000000 |
| 17804 | Romania | 3/1/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 17.850 | 43.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 19.754 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 19.754 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 19.754 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 19.754 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 19.754 | 43.9 | 0.631969 |
2076 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each shift within a single country, so one
# country's first rows never borrow the other country's mortality history
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows encodes "no history yet" as a
# 0 mortality rate instead of dropping those rows — confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' and its lagged copies — the prediction target
# leaks into the components later used as model inputs; verify this is wanted.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first six principal components (one per model input variable).
n_components = 6
pcs = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): each column below is a principal component (a linear mix of
# ALL numeric inputs, lags included), not the raw variable it is named after;
# the labels only make the downstream importance table easier to read.
pc_names = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
            'male_smokers', 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(pcs, columns=pc_names)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column (the dummies are not part of X below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_names
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor tuned by exhaustive grid search.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Grid search with 10-fold cross-validation, using all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.998693558932106
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the whole training set
# by default (refit=True), so this second fit() is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors and
# computes the KL divergence; it returns inf whenever y_test has a zero where
# y_pred does not (the recorded output below shows inf). It is not a standard
# regression metric -- confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0019333636112641855 R2 Score: 0.9986831440091684 RMSE: 0.043970 Entropy Value: inf
# Rank the six principal components by learned importance (note: the names
# refer to principal components, not to the raw input variables).
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.488905 |
| 1 | diabetes_prevalence | 0.289292 |
| 5 | median_age | 0.199869 |
| 2 | female_smokers | 0.017471 |
| 3 | male_smokers | 0.003379 |
| 4 | aged_65_older | 0.001083 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path; prefer a relative or
# configurable path for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under analysis.
country1 = 'Latvia'
country2 = 'Romania'
# Restrict to the country-level (health-system / socioeconomic) features
# plus the target, then to the two countries of interest.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand',
             'human_development_index', 'extreme_poverty', 'gdp_per_capita',
             'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[keep_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 17800 | Romania | 2/26/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| 17801 | Romania | 2/27/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| 17802 | Romania | 2/28/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| 17803 | Romania | 2/29/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| 17804 | Romania | 3/1/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631969 |
2076 rows × 9 columns
# Convert the per-country mortality time series into a supervised-learning
# table: lagged mortality columns (1 day, 1 week, 1 month back) let XGBoost
# treat each row as an independent observation and rank the predictors of
# COVID-19 mortality per country.
for lag_name, lag in (('prev_day_mortality', 1),
                      ('prev_week_mortality', 7),
                      ('prev_month_mortality', 30)):
    # Shift within each country; rows with no history are filled with 0.
    df_updated[lag_name] = (df_updated.groupby('location')['Mortality Rate']
                                      .shift(lag)
                                      .fillna(0))
# PCA over every column after location/date (including the new lag columns)
# to address multi-collinearity.
# NOTE(review): PCA is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the features -- confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first six principal components (one per model input variable).
n_components = 6
pcs = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): each column below is a principal component (a linear mix of
# ALL numeric inputs, lags included), not the raw variable it is named after;
# the labels only make the downstream importance table easier to read.
pc_names = ['hospital_beds_per_thousand', 'human_development_index',
            'extreme_poverty', 'gdp_per_capita', 'population_density',
            'population']
principal_df = pd.DataFrame(pcs, columns=pc_names)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column (the dummies are not part of X below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_names
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor tuned by exhaustive grid search.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Grid search with 10-fold cross-validation, using all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9983819303279453
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the whole training set
# by default (refit=True), so this second fit() is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors and
# computes the KL divergence; it returns inf whenever y_test has a zero where
# y_pred does not. It is not a standard regression metric -- confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0020986676904163337 R2 Score: 0.9985705518068159 RMSE: 0.045811 Entropy Value: 0.0002345345172898176
# Rank the six principal components by learned importance (note: the names
# refer to principal components, not to the raw input variables).
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | hospital_beds_per_thousand | 0.672578 |
| 1 | human_development_index | 0.206698 |
| 5 | population | 0.109832 |
| 2 | extreme_poverty | 0.010005 |
| 3 | gdp_per_capita | 0.000671 |
| 4 | population_density | 0.000216 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path; prefer a relative or
# configurable path for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under analysis.
country1 = 'Serbia'
country2 = 'Slovakia'
# Restrict to the population-health features plus the target, then to the
# two countries of interest.
keep_cols = ['location', 'date', 'cardiovasc_death_rate',
             'diabetes_prevalence', 'female_smokers', 'male_smokers',
             'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated[keep_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 15.070 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 15.070 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 15.070 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 15.070 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 15.070 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 17.366 | 41.2 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 17.366 | 41.2 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 17.366 | 41.2 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 17.366 | 41.2 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 17.366 | 41.2 | 0.716205 |
2067 rows × 9 columns
# Convert the per-country mortality time series into a supervised-learning
# table: lagged mortality columns (1 day, 1 week, 1 month back) let XGBoost
# treat each row as an independent observation and rank the predictors of
# COVID-19 mortality per country.
for lag_name, lag in (('prev_day_mortality', 1),
                      ('prev_week_mortality', 7),
                      ('prev_month_mortality', 30)):
    # Shift within each country; rows with no history are filled with 0.
    df_updated[lag_name] = (df_updated.groupby('location')['Mortality Rate']
                                      .shift(lag)
                                      .fillna(0))
# PCA over every column after location/date (including the new lag columns)
# to address multi-collinearity.
# NOTE(review): PCA is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the features -- confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first six principal components (one per model input variable).
n_components = 6
pcs = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): each column below is a principal component (a linear mix of
# ALL numeric inputs, lags included), not the raw variable it is named after;
# the labels only make the downstream importance table easier to read.
pc_names = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
            'male_smokers', 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(pcs, columns=pc_names)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column (the dummies are not part of X below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_names
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor tuned by exhaustive grid search.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Grid search with 10-fold cross-validation, using all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9958533450047872
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the whole training set
# by default (refit=True), so this second fit() is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors and
# computes the KL divergence; it returns inf whenever y_test has a zero where
# y_pred does not (the recorded output below shows inf). It is not a standard
# regression metric -- confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0008290251329329237 R2 Score: 0.996607231553571 RMSE: 0.028793 Entropy Value: inf
# Rank the six principal components by learned importance (note: the names
# refer to principal components, not to the raw input variables).
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.667632 |
| 0 | cardiovasc_death_rate | 0.144057 |
| 5 | median_age | 0.127156 |
| 2 | female_smokers | 0.026975 |
| 4 | aged_65_older | 0.021287 |
| 3 | male_smokers | 0.012893 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path; prefer a relative or
# configurable path for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under analysis.
country1 = 'Serbia'
country2 = 'Slovakia'
# Restrict to the country-level (health-system / socioeconomic) features
# plus the target, then to the two countries of interest.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand',
             'human_development_index', 'extreme_poverty', 'gdp_per_capita',
             'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[keep_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716205 |
2067 rows × 9 columns
# Convert the per-country mortality time series into a supervised-learning
# table: lagged mortality columns (1 day, 1 week, 1 month back) let XGBoost
# treat each row as an independent observation and rank the predictors of
# COVID-19 mortality per country.
for lag_name, lag in (('prev_day_mortality', 1),
                      ('prev_week_mortality', 7),
                      ('prev_month_mortality', 30)):
    # Shift within each country; rows with no history are filled with 0.
    df_updated[lag_name] = (df_updated.groupby('location')['Mortality Rate']
                                      .shift(lag)
                                      .fillna(0))
# PCA over every column after location/date (including the new lag columns)
# to address multi-collinearity.
# NOTE(review): PCA is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the features -- confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first six principal components (one per model input variable).
n_components = 6
pcs = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): each column below is a principal component (a linear mix of
# ALL numeric inputs, lags included), not the raw variable it is named after;
# the labels only make the downstream importance table easier to read.
pc_names = ['hospital_beds_per_thousand', 'human_development_index',
            'extreme_poverty', 'gdp_per_capita', 'population_density',
            'population']
principal_df = pd.DataFrame(pcs, columns=pc_names)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column (the dummies are not part of X below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_names
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor tuned by exhaustive grid search.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Grid search with 10-fold cross-validation, using all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9967966965293827
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the whole training set
# by default (refit=True), so this second fit() is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors and
# computes the KL divergence; it returns inf whenever y_test has a zero where
# y_pred does not (the recorded output below shows inf). It is not a standard
# regression metric -- confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0008619570126196017 R2 Score: 0.9964724585076836 RMSE: 0.029359 Entropy Value: inf
# Rank the six principal components by learned importance (note: the names
# refer to principal components, not to the raw input variables).
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.429013 |
| 0 | hospital_beds_per_thousand | 0.351790 |
| 5 | population | 0.185761 |
| 2 | extreme_poverty | 0.022991 |
| 3 | gdp_per_capita | 0.005310 |
| 4 | population_density | 0.005134 |
# Country Pair by Pair Analysis relative to aged_65_older
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): hard-coded absolute Windows path; prefer a relative or
# configurable path for portability.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Showing the pairings of countries based on aged_65_older (13 pairs of countries):
# one per-country dataframe for each of the 26 countries involved.
df_Bulgaria = df.loc[df["location"] == "Bulgaria"]
df_Finland = df.loc[df["location"] == "Finland"]
df_Italy = df.loc[df["location"] == "Italy"]
df_Portugal = df.loc[df["location"] == "Portugal"]
df_Sweden = df.loc[df["location"] == "Sweden"]
df_Austria = df.loc[df["location"] == "Austria"]
df_Belgium = df.loc[df["location"] == "Belgium"]
df_Canada = df.loc[df["location"] == "Canada"]
df_Czechia = df.loc[df["location"] == "Czechia"]
df_Denmark = df.loc[df["location"] == "Denmark"]
df_Estonia = df.loc[df["location"] == "Estonia"]
df_France = df.loc[df["location"] == "France"]
df_Latvia = df.loc[df["location"] == "Latvia"]
df_Netherlands = df.loc[df["location"] == "Netherlands"]
df_Romania = df.loc[df["location"] == "Romania"]
df_Serbia = df.loc[df["location"] == "Serbia"]
df_Slovenia = df.loc[df["location"] == "Slovenia"]
df_Spain = df.loc[df["location"] == "Spain"]
df_Switzerland = df.loc[df["location"] == "Switzerland"]
df_UnitedKingdom = df.loc[df["location"] == "United Kingdom"]
df_Cyprus = df.loc[df["location"] == "Cyprus"]
df_Iceland = df.loc[df["location"] == "Iceland"]
df_Ireland = df.loc[df["location"] == "Ireland"]
df_Luxembourg = df.loc[df["location"] == "Luxembourg"]
df_Slovakia = df.loc[df["location"] == "Slovakia"]
df_UnitedStates = df.loc[df["location"] == "United States"]
# Drop the first two UK rows (tail(-2)); presumably this aligns the UK series
# with the other countries' start dates — TODO confirm.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
# Stack all 26 per-country dataframes into a single dataframe (order preserved)
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)
# Persist the combined dataframe, then read it back for the next stage
dataframe_one.to_csv("dataframe-one.csv")
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair under comparison for the population health index model
country1 = 'Bulgaria'
country2 = 'Finland'
# Keep only the population-health features plus the target
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy', 'median_age',
               'Mortality Rate']
df_updated = df_updated[health_cols]
# Restrict to the two paired countries
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 42.8 | 0.551590 |
| 8372 | Finland | 12/26/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 42.8 | 0.551590 |
| 8373 | Finland | 12/27/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 42.8 | 0.551590 |
| 8374 | Finland | 12/28/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 42.8 | 0.551590 |
| 8375 | Finland | 12/29/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 42.8 | 0.551590 |
2093 rows × 9 columns
# Convert the time series to a supervised-learning table: XGBoost needs one
# row per observation, so create lagged copies of the target (previous day,
# previous week, previous month) per country with pandas shift().
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, lag in lag_spec.items():
    # shift() leaves NaNs at the head of each country's series; replace them
    # with 0 so the full history remains usable for training.
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
# Principal Component Analysis to address multi-collinearity among the inputs.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and its lag columns,
# so the target feeds into the components — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep six principal components — one per input variable of the population
# health index XGBoost model.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): the component columns reuse the raw feature names purely as
# labels; each column is a principal component, not that original feature.
pc_labels = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
principal_df = pd.DataFrame(data=principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical country column via get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only (avoids test-set leakage)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the training-set statistics
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor to be tuned
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning combination and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9667879314777128
# Refit the tuned model on the full training split
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict mortality for the held-out test split
y_pred = best_model.predict(X_test_scaled)
# Evaluate with MSE, RMSE, R^2, and entropy (KL divergence of y_test vs y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes both vectors and returns the KL
# divergence D(pk || qk); any zero in qk where pk is nonzero makes it
# infinite, and negative entries are invalid. Clip to non-negative and add a
# tiny epsilon so the diagnostic stays finite and comparable across runs.
eps = 1e-12
entropy_val = entropy(np.clip(y_test, 0, None) + eps, np.clip(y_pred, 0, None) + eps)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0047703788248759415 R2 Score: 0.9980732819098217 RMSE: 0.069068 Entropy Value: 0.0006378391950056142
# Per-input importance scores from the tuned XGBoost model.
# NOTE(review): X was built from PCA components, not the raw columns, so these
# importances describe principal components; pairing them with selected_cols
# is only a positional labelling — confirm this interpretation is intended.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
# Rank inputs from most to least influential
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.374058 |
| 0 | cardiovasc_death_rate | 0.360130 |
| 1 | diabetes_prevalence | 0.179878 |
| 3 | male_smokers | 0.033397 |
| 2 | female_smokers | 0.031643 |
| 4 | life_expectancy | 0.020894 |
# Reload the 26-country combined dataframe exported earlier
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair under comparison for the country health index model
country1 = 'Bulgaria'
country2 = 'Finland'
# Keep only the country-level socioeconomic features plus the target
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                'Mortality Rate']
df_updated = df_updated[country_cols]
# Restrict to the two paired countries
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 3.280 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.551590 |
| 8372 | Finland | 12/26/2022 | 3.280 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.551590 |
| 8373 | Finland | 12/27/2022 | 3.280 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.551590 |
| 8374 | Finland | 12/28/2022 | 3.280 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.551590 |
| 8375 | Finland | 12/29/2022 | 3.280 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.551590 |
2093 rows × 9 columns
# Convert the time series to a supervised-learning table: XGBoost needs one
# row per observation, so create lagged copies of the target (previous day,
# previous week, previous month) per country with pandas shift().
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, lag in lag_spec.items():
    # shift() leaves NaNs at the head of each country's series; replace them
    # with 0 so the full history remains usable for training.
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
# Principal Component Analysis to address multi-collinearity among the inputs.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and its lag columns,
# so the target feeds into the components — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep six principal components — one per input variable of the country
# health index XGBoost model.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): the component columns reuse the raw feature names purely as
# labels; each column is a principal component, not that original feature.
pc_labels = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
principal_df = pd.DataFrame(data=principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical country column via get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only (avoids test-set leakage)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the training-set statistics
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor to be tuned
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning combination and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.965382393805797
# Refit the tuned model on the full training split
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict mortality for the held-out test split
y_pred = best_model.predict(X_test_scaled)
# Evaluate with MSE, RMSE, R^2, and entropy (KL divergence of y_test vs y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes both vectors and returns the KL
# divergence D(pk || qk); any zero in qk where pk is nonzero makes it
# infinite, and negative entries are invalid. Clip to non-negative and add a
# tiny epsilon so the diagnostic stays finite and comparable across runs.
eps = 1e-12
entropy_val = entropy(np.clip(y_test, 0, None) + eps, np.clip(y_pred, 0, None) + eps)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005470667380413319 R2 Score: 0.9977904409284591 RMSE: 0.073964 Entropy Value: 0.0007247508478906556
# Per-input importance scores from the tuned XGBoost model.
# NOTE(review): X was built from PCA components, not the raw columns, so these
# importances describe principal components; pairing them with selected_cols
# is only a positional labelling — confirm this interpretation is intended.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
# Rank inputs from most to least influential
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | hospital_beds_per_thousand | 0.657479 |
| 5 | population | 0.175003 |
| 1 | human_development_index | 0.122471 |
| 2 | extreme_poverty | 0.022564 |
| 3 | gdp_per_capita | 0.011838 |
| 4 | population_density | 0.010644 |
# Reload the 26-country combined dataframe exported earlier
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair under comparison for the population health index model
country1 = 'Italy'
country2 = 'Portugal'
# Keep only the population-health features plus the target
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy', 'median_age',
               'Mortality Rate']
df_updated = df_updated[health_cols]
# Restrict to the two paired countries
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 46.2 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 46.2 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 46.2 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 46.2 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 46.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 47.9 | 0.735109 |
2098 rows × 9 columns
# Convert the time series to a supervised-learning table: XGBoost needs one
# row per observation, so create lagged copies of the target (previous day,
# previous week, previous month) per country with pandas shift().
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, lag in lag_spec.items():
    # shift() leaves NaNs at the head of each country's series; replace them
    # with 0 so the full history remains usable for training.
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
# Principal Component Analysis to address multi-collinearity among the inputs.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and its lag columns,
# so the target feeds into the components — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep six principal components — one per input variable of the population
# health index XGBoost model.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): the component columns reuse the raw feature names purely as
# labels; each column is a principal component, not that original feature.
pc_labels = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
principal_df = pd.DataFrame(data=principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical country column via get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only (avoids test-set leakage)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the training-set statistics
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor to be tuned
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning combination and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9993592153995753
# Refit the tuned model on the full training split
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict mortality for the held-out test split
y_pred = best_model.predict(X_test_scaled)
# Evaluate with MSE, RMSE, R^2, and entropy (KL divergence of y_test vs y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes both vectors and returns the KL
# divergence D(pk || qk); any zero in qk where pk is nonzero makes it
# infinite, and negative entries are invalid. Clip to non-negative and add a
# tiny epsilon so the diagnostic stays finite and comparable across runs.
eps = 1e-12
entropy_val = entropy(np.clip(y_test, 0, None) + eps, np.clip(y_pred, 0, None) + eps)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.020270373691906065 R2 Score: 0.9982206997530265 RMSE: 0.142374 Entropy Value: 0.0009509974446059138
# Per-input importance scores from the tuned XGBoost model.
# NOTE(review): X was built from PCA components, not the raw columns, so these
# importances describe principal components; pairing them with selected_cols
# is only a positional labelling — confirm this interpretation is intended.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
# Rank inputs from most to least influential
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.614331 |
| 0 | cardiovasc_death_rate | 0.304136 |
| 2 | female_smokers | 0.042708 |
| 5 | median_age | 0.038273 |
| 3 | male_smokers | 0.000499 |
| 4 | life_expectancy | 0.000053 |
# Reload the 26-country combined dataframe exported earlier
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the pipeline
country1 = 'Italy'
country2 = 'Portugal'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
2098 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (previous day / week / month).
# shift() is applied within each country so values never cross locations;
# leading NaNs (rows with no history yet) are replaced with 0.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    lagged = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
    df_updated[lag_col] = lagged.fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] at this point includes 'Mortality Rate'
# and the three lagged-mortality columns, so the target variable itself is
# part of the PCA input. The components fed to the model therefore leak the
# target (a likely explanation for the near-perfect downstream R^2) --
# confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each principal component is a linear combination of ALL the
# PCA input columns; labelling the components with the original feature
# names is misleading and colours the later feature-importance table.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used -- X below is
# built from principal_df and y from the untouched 'Mortality Rate' column;
# the call effectively just removes the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split shuffles time-series rows; a
# chronological split would avoid look-ahead bias -- confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (avoids fitting the scaler on test data)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and the corresponding mean CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9992988336861244
# Fit the model using the best hyperparameters
# (redundant: GridSearchCV already refits the best estimator by default)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XGBoost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions and with two args returns KL divergence -- not a regression
# error metric; interpret 'Entropy Value' with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.013594354001948517 R2 Score: 0.9988067098416261 RMSE: 0.116595 Entropy Value: 0.0007082932658725094
# Tabulate the trained model's feature importances, highest first.
# NOTE(review): these labels name principal components, not the original
# variables -- see the PCA step above; confirm before interpreting.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.526697 |
| 1 | human_development_index | 0.421271 |
| 2 | extreme_poverty | 0.034415 |
| 0 | hospital_beds_per_thousand | 0.010278 |
| 3 | gdp_per_capita | 0.007127 |
| 4 | population_density | 0.000212 |
# Importing the dataframe of all 26 countries
# (re-read from the local CSV to restore the columns dropped in the
# previous run; absolute Windows path -- not portable)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the pipeline
country1 = 'Sweden'
country2 = 'Austria'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 41.0 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 41.0 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 41.0 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 41.0 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 41.0 | 0.816005 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (previous day / week / month).
# shift() runs within each country so values never bleed across locations;
# rows without enough history get 0 instead of NaN.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    lagged = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
    df_updated[lag_col] = lagged.fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and the
# three lagged-mortality columns here, so the target leaks into the PCA
# input (likely why downstream R^2 is near 1) -- confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): components are linear combinations of ALL PCA input columns;
# naming them after the original features is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used -- X is built from
# principal_df below; this call effectively just drops 'location'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split shuffles a time series; a chronological split
# would avoid look-ahead bias -- confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (avoids fitting the scaler on test data)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and the corresponding mean CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9986828973635486
# Fit the model using the best hyperparameters
# (redundant: GridSearchCV already refits the best estimator by default)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XGBoost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions and with two args returns KL divergence -- not a regression
# error metric; interpret 'Entropy Value' with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.01245250411538312 R2 Score: 0.9973372856257704 RMSE: 0.111591 Entropy Value: 0.0006474195228361843
# Tabulate the trained model's feature importances, highest first.
# NOTE(review): these labels name principal components, not the original
# variables -- see the PCA step above; confirm before interpreting.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.397694 |
| 5 | median_age | 0.349627 |
| 1 | diabetes_prevalence | 0.226924 |
| 2 | female_smokers | 0.017465 |
| 4 | life_expectancy | 0.004970 |
| 3 | male_smokers | 0.003319 |
# Importing the dataframe of all 26 countries
# (re-read from the local CSV to restore the columns dropped in the
# previous run; absolute Windows path -- not portable)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the pipeline
country1 = 'Sweden'
country2 = 'Austria'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.816005 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (previous day / week / month).
# Each lag is computed per country via groupby + shift, with the initial
# NaN rows (no history yet) zero-filled.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    lagged = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
    df_updated[lag_col] = lagged.fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and the
# three lagged-mortality columns here, so the target leaks into the PCA
# input (likely why downstream R^2 is near 1) -- confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): components are linear combinations of ALL PCA input columns;
# naming them after the original features is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used -- X is built from
# principal_df below; this call effectively just drops 'location'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split shuffles a time series; a chronological split
# would avoid look-ahead bias -- confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (avoids fitting the scaler on test data)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and the corresponding mean CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9984810446105193
# Fit the model using the best hyperparameters
# (redundant: GridSearchCV already refits the best estimator by default)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XGBoost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions and with two args returns KL divergence -- not a regression
# error metric; interpret 'Entropy Value' with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.014427701442901309 R2 Score: 0.9969149299078208 RMSE: 0.120115 Entropy Value: 0.000925590220887666
# Tabulate the trained model's feature importances, highest first.
# NOTE(review): these labels name principal components, not the original
# variables -- see the PCA step above; confirm before interpreting.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.475411 |
| 0 | hospital_beds_per_thousand | 0.224173 |
| 2 | extreme_poverty | 0.198854 |
| 5 | population | 0.080118 |
| 3 | gdp_per_capita | 0.020331 |
| 4 | population_density | 0.001113 |
# Importing the dataframe of all 26 countries
# (re-read from the local CSV to restore the columns dropped in the
# previous run; absolute Windows path -- not portable)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the pipeline
country1 = 'Belgium'
country2 = 'Canada'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 41.8 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 41.8 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 41.8 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 41.8 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 41.8 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 41.4 | 1.093162 |
2132 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (previous day / week / month).
# Each lag is computed per country via groupby + shift, with the initial
# NaN rows (no history yet) zero-filled.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    lagged = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
    df_updated[lag_col] = lagged.fillna(0)
# Principal Component Analysis (PCA), intended to address multi-collinearity
# among the predictors before the XGBoost fit.
pca = PCA()
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which
# includes 'Mortality Rate' itself and its three lag columns — the prediction
# target leaks into the PCA inputs, which largely explains the near-perfect
# R^2 reported further down. Drop the target column before fitting.
# NOTE(review): PCA is fit on unscaled data here while StandardScaler is only
# applied afterwards; large-variance columns dominate the components. The
# usual order is scale first, then PCA — confirm which was intended.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components (matches the number of input variables
# chosen for the population-health-index XGBoost analysis).
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): the component columns are re-labelled with the original feature
# names, but each column is a principal component — a linear mixture of ALL
# PCA inputs — so the downstream "feature importances" describe PCs, not these
# named health variables. Consider PC1..PC6 as column names instead.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'.
# NOTE(review): the dummy columns produced here are never used — X is built
# from principal_df and only 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split into train/test sets for the XGBoost model.
# NOTE(review): train_test_split shuffles rows of a daily time series, and the
# features include lagged values of the target, so rows adjacent to test rows
# end up in training (temporal leakage). A chronological holdout or
# TimeSeriesSplit would give an honest performance estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only (avoids scaling leakage from test rows).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define the XGBoost regressor to be tuned.
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid: 3*3*3*3*2*2 = 324 candidates.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Grid search with 10-fold cross-validation; scoring defaults to the
# regressor's R^2. The shuffled folds share the temporal-leakage caveat noted
# at the split above.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9990497455346172
# Refit using the best hyperparameters.
# NOTE(review): GridSearchCV(refit=True, the default) has already refit
# best_estimator_ on the full training set; this fit is a harmless repeat.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out (scaled) test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate with MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# the two vectors after normalising each to sum to 1 — it is not a regression
# error metric, and it is undefined wherever y_pred is 0 while y_test is not.
# The value printed below should not be read as model quality.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.016727148283166463 R2 Score: 0.9987351396841938 RMSE: 0.129333 Entropy Value: 0.0004295927143185571
feature_importances = best_model.feature_importances_
# NOTE(review): these "features" are principal components that were re-labelled
# with original column names, so the importances rank PCs, not the named
# health variables themselves.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.908673 |
| 0 | cardiovasc_death_rate | 0.053697 |
| 5 | median_age | 0.021604 |
| 2 | female_smokers | 0.012787 |
| 3 | male_smokers | 0.003085 |
| 4 | life_expectancy | 0.000153 |
# Re-import the full 26-country dataframe (resets df_updated after the
# previous per-country filtering).
# NOTE(review): absolute, user-specific Windows path — not portable; consider
# a relative path or a configuration constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis.
country1 = 'Belgium'
country2 = 'Canada'
# Keep the socio-economic ("country health index") features plus identifiers
# and the target.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.093162 |
2132 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, previous week, and previous
# month mortality rates (converts the time series into a supervised problem).
# Work on an explicit copy: df_updated was produced by boolean-mask filtering,
# so writing new columns into it can raise SettingWithCopyWarning and is not
# guaranteed to hit the intended frame. .copy() makes the writes unambiguous.
df_updated = df_updated.copy()
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # Shift within each country so one country's history never bleeds into another's.
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
    # The first `lag` rows of each country have no history; fill with 0.
    # NOTE(review): 0 matches the early-pandemic mortality here, but it is a
    # modelling choice rather than a neutral imputation — confirm intended.
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Principal Component Analysis (PCA), intended to address multi-collinearity.
pca = PCA()
# NOTE(review): iloc[:, 2:] still contains 'Mortality Rate' and its three lag
# columns, so the prediction target leaks into the PCA inputs (inflating the
# R^2 reported below). Drop the target column before fitting.
# NOTE(review): PCA is fit on unscaled data; 'population' has enormous
# variance and will dominate the components — scale before PCA.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components (matches the number of input variables
# chosen for the country-health-index XGBoost analysis).
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): each column below is a principal component (a mixture of ALL
# PCA inputs) re-labelled with an original feature name — the later "feature
# importances" therefore rank PCs, not these named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'. NOTE(review): the dummy columns are never used —
# only 'Mortality Rate' is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split into train/test sets for the XGBoost model.
# NOTE(review): train_test_split shuffles rows of a daily time series whose
# features include lagged targets — rows adjacent to test rows end up in
# training (temporal leakage). Prefer a chronological holdout / TimeSeriesSplit.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only (avoids scaling leakage from test rows).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define the XGBoost regressor to be tuned.
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid: 3*3*3*3*2*2 = 324 candidates.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Grid search with 10-fold cross-validation; scoring defaults to the
# regressor's R^2. Shuffled folds share the temporal-leakage caveat above.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9982635754420282
# Refit using the best hyperparameters.
# NOTE(review): GridSearchCV(refit=True, the default) has already refit
# best_estimator_ on the full training set; this fit is a harmless repeat.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out (scaled) test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate with MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the
# normalised vectors — not a regression error metric; do not read the value
# printed below as model quality.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.021593332086563184 R2 Score: 0.9983671724325057 RMSE: 0.146947 Entropy Value: 0.0011894401573703628
feature_importances = best_model.feature_importances_
# NOTE(review): these "features" are principal components re-labelled with
# original column names; the importances rank PCs, not the named variables.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.854404 |
| 0 | hospital_beds_per_thousand | 0.055223 |
| 2 | extreme_poverty | 0.054972 |
| 5 | population | 0.017190 |
| 3 | gdp_per_capita | 0.014188 |
| 4 | population_density | 0.004023 |
# Re-import the full 26-country dataframe (resets df_updated after the
# previous per-country filtering).
# NOTE(review): absolute, user-specific Windows path — not portable; consider
# a relative path or a configuration constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis.
country1 = 'Czechia'
country2 = 'Denmark'
# Keep the population-health features plus identifiers and the target.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
# Restrict to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 43.3 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 43.3 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 43.3 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 43.3 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 43.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6244 | Denmark | 12/25/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 42.3 | 0.227772 |
| 6245 | Denmark | 12/26/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 42.3 | 0.227772 |
| 6246 | Denmark | 12/27/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 42.3 | 0.228905 |
| 6247 | Denmark | 12/28/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 42.3 | 0.229131 |
| 6248 | Denmark | 12/29/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 42.3 | 0.229131 |
2096 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, previous week, and previous
# month mortality rates (converts the time series into a supervised problem).
# Work on an explicit copy: df_updated was produced by boolean-mask filtering,
# so writing new columns into it can raise SettingWithCopyWarning and is not
# guaranteed to hit the intended frame. .copy() makes the writes unambiguous.
df_updated = df_updated.copy()
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # Shift within each country so one country's history never bleeds into another's.
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
    # The first `lag` rows of each country have no history; fill with 0.
    # NOTE(review): 0 matches the early-pandemic mortality here, but it is a
    # modelling choice rather than a neutral imputation — confirm intended.
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Principal Component Analysis (PCA), intended to address multi-collinearity.
pca = PCA()
# NOTE(review): iloc[:, 2:] still contains 'Mortality Rate' and its three lag
# columns, so the prediction target leaks into the PCA inputs (inflating the
# R^2 reported below). Drop the target column before fitting.
# NOTE(review): PCA is fit on unscaled data; large-variance columns dominate
# the components — scale before PCA.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components (matches the number of input variables
# chosen for the population-health-index XGBoost analysis).
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): each column below is a principal component (a mixture of ALL
# PCA inputs) re-labelled with an original feature name — the later "feature
# importances" therefore rank PCs, not these named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'. NOTE(review): the dummy columns are never used —
# only 'Mortality Rate' is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split into train/test sets for the XGBoost model.
# NOTE(review): train_test_split shuffles rows of a daily time series whose
# features include lagged targets — rows adjacent to test rows end up in
# training (temporal leakage). Prefer a chronological holdout / TimeSeriesSplit.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only (avoids scaling leakage from test rows).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define the XGBoost regressor to be tuned.
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid: 3*3*3*3*2*2 = 324 candidates.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Grid search with 10-fold cross-validation; scoring defaults to the
# regressor's R^2. Shuffled folds share the temporal-leakage caveat above.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990998156820339
# Refit using the best hyperparameters.
# NOTE(review): GridSearchCV(refit=True, the default) has already refit
# best_estimator_ on the full training set; this fit is a harmless repeat.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out (scaled) test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate with MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the
# normalised vectors — not a regression error metric; do not read the value
# printed below as model quality.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0022554314390671685 R2 Score: 0.9981274828812655 RMSE: 0.047491 Entropy Value: 0.00039819174563969235
feature_importances = best_model.feature_importances_
# NOTE(review): these "features" are principal components re-labelled with
# original column names; the importances rank PCs, not the named variables.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.847823 |
| 5 | median_age | 0.105723 |
| 0 | cardiovasc_death_rate | 0.030798 |
| 2 | female_smokers | 0.014866 |
| 3 | male_smokers | 0.000634 |
| 4 | life_expectancy | 0.000156 |
# Re-import the full 26-country dataframe (resets df_updated after the
# previous per-country filtering).
# NOTE(review): absolute, user-specific Windows path — not portable; consider
# a relative path or a configuration constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis.
country1 = 'Czechia'
country2 = 'Denmark'
# Keep the socio-economic ("country health index") features plus identifiers
# and the target.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6244 | Denmark | 12/25/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.227772 |
| 6245 | Denmark | 12/26/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.227772 |
| 6246 | Denmark | 12/27/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.228905 |
| 6247 | Denmark | 12/28/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.229131 |
| 6248 | Denmark | 12/29/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.229131 |
2096 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, previous week, and previous
# month mortality rates (converts the time series into a supervised problem).
# Work on an explicit copy: df_updated was produced by boolean-mask filtering,
# so writing new columns into it can raise SettingWithCopyWarning and is not
# guaranteed to hit the intended frame. .copy() makes the writes unambiguous.
df_updated = df_updated.copy()
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # Shift within each country so one country's history never bleeds into another's.
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
    # The first `lag` rows of each country have no history; fill with 0.
    # NOTE(review): 0 matches the early-pandemic mortality here, but it is a
    # modelling choice rather than a neutral imputation — confirm intended.
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Principal Component Analysis (PCA), intended to address multi-collinearity.
pca = PCA()
# NOTE(review): iloc[:, 2:] still contains 'Mortality Rate' and its three lag
# columns, so the prediction target leaks into the PCA inputs (inflating the
# R^2 reported below). Drop the target column before fitting.
# NOTE(review): PCA is fit on unscaled data; 'population' has enormous
# variance and will dominate the components — scale before PCA.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components (matches the number of input variables
# chosen for the country-health-index XGBoost analysis).
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): each column below is a principal component (a mixture of ALL
# PCA inputs) re-labelled with an original feature name — the later "feature
# importances" therefore rank PCs, not these named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'. NOTE(review): the dummy columns are never used —
# only 'Mortality Rate' is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split into train/test sets for the XGBoost model.
# NOTE(review): train_test_split shuffles rows of a daily time series whose
# features include lagged targets — rows adjacent to test rows end up in
# training (temporal leakage). Prefer a chronological holdout / TimeSeriesSplit.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only (avoids scaling leakage from test rows).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define the XGBoost regressor to be tuned.
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid: 3*3*3*3*2*2 = 324 candidates.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Grid search with 10-fold cross-validation; scoring defaults to the
# regressor's R^2. Shuffled folds share the temporal-leakage caveat above.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990802017690633
# Refit using the best hyperparameters.
# NOTE(review): GridSearchCV(refit=True, the default) has already refit
# best_estimator_ on the full training set; this fit is a harmless repeat.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out (scaled) test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate with MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the
# normalised vectors — not a regression error metric; do not read the value
# printed below as model quality.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0018119703339651765 R2 Score: 0.9984956556824479 RMSE: 0.042567 Entropy Value: 0.0003609256883659556
feature_importances = best_model.feature_importances_
# NOTE(review): these "features" are principal components re-labelled with
# original column names; the importances rank PCs, not the named variables.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.672643 |
| 0 | hospital_beds_per_thousand | 0.166905 |
| 5 | population | 0.134752 |
| 2 | extreme_poverty | 0.022526 |
| 3 | gdp_per_capita | 0.002956 |
| 4 | population_density | 0.000219 |
# Re-import the full 26-country dataframe (resets df_updated after the
# previous per-country filtering).
# NOTE(review): absolute, user-specific Windows path — not portable; consider
# a relative path or a configuration constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the pipeline
country1 = 'Estonia'
country2 = 'France'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 42.7 | 0.000000 |
| 6250 | Estonia | 1/18/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 42.7 | 0.000000 |
| 6251 | Estonia | 2/5/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 42.7 | 0.000000 |
| 6252 | Estonia | 2/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 42.7 | 0.000000 |
| 6253 | Estonia | 2/7/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 42.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 42.0 | 0.411710 |
| 9443 | France | 12/26/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 42.0 | 0.411282 |
| 9444 | France | 12/27/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 42.0 | 0.411730 |
| 9445 | France | 12/28/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 42.0 | 0.411813 |
| 9446 | France | 12/29/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 42.0 | 0.411892 |
2132 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged Mortality Rate features (1 day / 1 week / 1 month back),
# shifted within each country so one country's series never bleeds into
# another's; leading rows with no available lag are zero-filled.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis to address multi-collinearity among predictors.
# FIX: the original code fit/transformed PCA on df_updated.iloc[:, 2:], which
# also contained 'Mortality Rate' and its three lag columns — the target
# leaked into the component scores used as model inputs, inflating the
# reported CV/R^2 scores. PCA is now restricted to the six predictors only.
# NOTE(review): the predictors are on very different scales; standardizing
# before PCA would keep large-scale columns from dominating — confirm intent.
feature_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                'male_smokers', 'life_expectancy', 'median_age']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep 6 principal components — equal to the number of input variables
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# NOTE: these column labels name principal components, not the original
# variables; each component is a linear combination of all six predictors.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below —
# X is taken from principal_df only — so this step has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values  # PCA component scores, row-aligned with df_updated
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series lets future rows appear in the
# training set; a chronological split would give a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (avoids leaking test-set statistics)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using training-set mean/std)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost regression model (default squared-error objective)
xgb_model = xgb.XGBRegressor()
# Hyperparameters to tune: tree depth, shrinkage, ensemble size,
# minimum split loss (gamma), and row/column subsampling ratios
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10) over all
# 3*3*3*3*2*2 = 324 parameter combinations; n_jobs=-1 uses all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and the mean cross-validated R^2 score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9976163185744944
# Use the model GridSearchCV already refit on the full training set.
# FIX: the original code called best_model.fit(X_train_scaled, y_train) again
# here; GridSearchCV(refit=True, the default) has already refit
# best_estimator_ on exactly this data, so the extra fit was redundant.
best_model = grid_search.best_estimator_
# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: MSE, RMSE, R^2 Score, and "Entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arrays into probability
# distributions and computes their KL divergence — not a standard regression
# metric; it returns inf if y_pred has a 0 where y_test does not.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0627316342415563 R2 Score: 0.9934687538779688 RMSE: 0.250463 Entropy Value: 0.0031520180501137213
# NOTE(review): X was built from PCA component scores, so these "feature"
# labels actually name principal components, not the original columns — each
# importance belongs to a linear combination of all six predictors, not to
# the individually named variable.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.613007 |
| 0 | cardiovasc_death_rate | 0.318802 |
| 5 | median_age | 0.047053 |
| 2 | female_smokers | 0.012094 |
| 3 | male_smokers | 0.008496 |
| 4 | life_expectancy | 0.000548 |
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Estonia'
country2 = 'France'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6250 | Estonia | 1/18/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6251 | Estonia | 2/5/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6252 | Estonia | 2/6/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6253 | Estonia | 2/7/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411710 |
| 9443 | France | 12/26/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411282 |
| 9444 | France | 12/27/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411730 |
| 9445 | France | 12/28/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411813 |
| 9446 | France | 12/29/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411892 |
2132 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Per-country lagged Mortality Rate features (previous day / week / month);
# NaNs in the leading rows of each country's series are replaced with 0.
grouped_rate = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = grouped_rate.shift(1).fillna(0)
df_updated['prev_week_mortality'] = grouped_rate.shift(7).fillna(0)
df_updated['prev_month_mortality'] = grouped_rate.shift(30).fillna(0)
# Principal Component Analysis to address multi-collinearity among predictors.
# FIX: the original code fit/transformed PCA on df_updated.iloc[:, 2:], which
# also contained 'Mortality Rate' and its three lag columns — the target
# leaked into the component scores used as model inputs, inflating the
# reported CV/R^2 scores. PCA is now restricted to the six predictors only.
# NOTE(review): 'population' is orders of magnitude larger than the other
# columns; standardizing before PCA would keep it from dominating — confirm.
feature_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
                'gdp_per_capita', 'population_density', 'population']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep 6 principal components — equal to the number of input variables
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# NOTE: these column labels name principal components, not the original
# variables; each component is a linear combination of all six predictors.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9978195582971618
# Use the model GridSearchCV already refit on the full training set.
# FIX: the original code called best_model.fit(X_train_scaled, y_train) again
# here; GridSearchCV(refit=True, the default) has already refit
# best_estimator_ on exactly this data, so the extra fit was redundant.
best_model = grid_search.best_estimator_
# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: MSE, RMSE, R^2 Score, and "Entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arrays into probability
# distributions and computes their KL divergence — not a standard regression
# metric; it returns inf if y_pred has a 0 where y_test does not.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.07504882607957856 R2 Score: 0.9921863608333906 RMSE: 0.273950 Entropy Value: 0.004313799679754843
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.760584 |
| 5 | population | 0.117334 |
| 0 | hospital_beds_per_thousand | 0.096855 |
| 2 | extreme_poverty | 0.013718 |
| 3 | gdp_per_capita | 0.011203 |
| 4 | population_density | 0.000306 |
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Latvia'
country2 = 'Netherlands'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 43.9 | 0.631969 |
2075 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied within each country so one country's series never bleeds into another's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the leading rows treats "no data yet" as a
# mortality rate of 0; dropping those rows instead may be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Principal Component Analysis to address multi-collinearity among predictors.
# FIX: the original code fit/transformed PCA on df_updated.iloc[:, 2:], which
# also contained 'Mortality Rate' and its three lag columns — the target
# leaked into the component scores used as model inputs, inflating the
# reported CV/R^2 scores. PCA is now restricted to the six predictors only.
# NOTE(review): the predictors are on very different scales; standardizing
# before PCA would keep large-scale columns from dominating — confirm intent.
feature_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                'male_smokers', 'life_expectancy', 'median_age']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep 6 principal components — equal to the number of input variables
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# NOTE: these column labels name principal components, not the original
# variables; each component is a linear combination of all six predictors.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990018484824634
# Use the model GridSearchCV already refit on the full training set.
# FIX: the original code called best_model.fit(X_train_scaled, y_train) again
# here; GridSearchCV(refit=True, the default) has already refit
# best_estimator_ on exactly this data, so the extra fit was redundant.
best_model = grid_search.best_estimator_
# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: MSE, RMSE, R^2 Score, and "Entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arrays into probability
# distributions and computes their KL divergence — not a standard regression
# metric; it returns inf if y_pred has a 0 where y_test does not.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005596727727292748 R2 Score: 0.99924811770952 RMSE: 0.074811 Entropy Value: 0.0003576869080799259
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.832180 |
| 0 | cardiovasc_death_rate | 0.135251 |
| 2 | female_smokers | 0.017563 |
| 5 | median_age | 0.014338 |
| 3 | male_smokers | 0.000591 |
| 4 | life_expectancy | 0.000076 |
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Latvia'
country2 = 'Netherlands'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631969 |
2075 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied within each country so one country's series never bleeds into another's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the leading rows treats "no data yet" as a
# mortality rate of 0; dropping those rows instead may be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Principal Component Analysis to reduce multi-collinearity among predictors.
# BUG FIX: the original fit PCA on df_updated.iloc[:, 2:], which included the
# 'Mortality Rate' target column itself — leaking the target into the model
# inputs and inflating every downstream score. PCA is now fit on predictor
# columns only (static country features plus the lagged mortality rates).
predictors = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(predictors)
# Keep 6 components to match the number of input variables used by the
# XGBoost analysis for the country health index.
n_components = 6
principal_components = pca.transform(predictors)[:, :n_components]
# NOTE(review): each column below is a principal component (a linear mix of
# ALL predictors), not the original feature it is named after — downstream
# "feature importances" inherit this mislabeling. PCA on unscaled data also
# lets large-magnitude columns (e.g. population) dominate the components;
# consider standardizing before PCA.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (the resulting dummy columns are not used
# by the model below, which draws its inputs from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs: the six principal-component columns; target: mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# Hold out 30% for testing, seeded for reproducibility.
# NOTE(review): a shuffled random split on time-series rows lets neighbouring
# (near-duplicate) days land in both train and test sets — a time-based split
# would give a more honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: fit the scaler on the training split only, then apply
# the identical transform to both splits so no test-set statistics leak into
# training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
# (a default-configured XGBRegressor; GridSearchCV clones and re-parameterizes it)
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3 depths x 3 learning rates x 3 tree counts x 3 gammas x 2 subsample x
# 2 colsample = 324 candidates; with cv=10 that is 3240 model fits.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all cores; scoring defaults to the regressor's R^2 score.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9988456921162413
# Fit the model using the best hyperparameters
# (GridSearchCV with refit=True, the default, has already refit
# best_estimator_ on the full training set, so this fit is redundant but
# harmless)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of the
# two vectors after renormalizing each to sum to 1 — it is not a standard
# regression metric and is ill-defined when y_pred contains zero or negative
# entries; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005570458953609194 R2 Score: 0.9992516467405338 RMSE: 0.074636 Entropy Value: 0.0006602651523713767
# Rank the model inputs by XGBoost's importance scores, highest first.
# NOTE(review): the inputs are principal components merely labelled with
# original feature names, so these importances describe the components,
# not the raw features.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.646902 |
| 0 | hospital_beds_per_thousand | 0.161196 |
| 2 | extreme_poverty | 0.124314 |
| 5 | population | 0.045094 |
| 3 | gdp_per_capita | 0.019807 |
| 4 | population_density | 0.002686 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — parameterize or use a
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Compare Romania and Serbia in the population health index analysis.
country1 = 'Romania'
country2 = 'Serbia'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
# .copy() detaches the two-country subset from the parent frame so that the
# lagged columns added later do not trigger pandas' SettingWithCopyWarning
# (or silently fail to propagate under copy-on-write).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 41.2 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 41.2 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 41.2 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 41.2 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 43.0 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 43.0 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 43.0 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 43.0 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 43.0 | 2.036403 |
2076 rows × 9 columns
# Convert the time series into a supervised-learning layout: keep lagged
# copies of the target on each row so XGBoost can use past mortality as
# tabular features.
# Work on an explicit copy: df_updated is a filtered slice of a larger frame,
# and assigning new columns to a slice raises pandas' SettingWithCopyWarning
# (and may silently not propagate under copy-on-write).
df_updated = df_updated.copy()
# Lag the mortality rate by 1 day, 7 days, and 30 days, shifting within each
# country so values never bleed across locations.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, lag in lag_spec.items():
    # NOTE(review): filling the initial lag window with 0 fabricates
    # "zero mortality" observations; dropping those rows may be safer.
    df_updated[lag_col] = mortality_by_country.shift(lag).fillna(0)
# Principal Component Analysis to reduce multi-collinearity among predictors.
# BUG FIX: the original fit PCA on df_updated.iloc[:, 2:], which included the
# 'Mortality Rate' target column itself — leaking the target into the model
# inputs and inflating every downstream score. PCA is now fit on predictor
# columns only (demographic/health features plus the lagged mortality rates).
predictors = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(predictors)
# Keep 6 components to match the number of input variables used by the
# XGBoost analysis for the population health index.
n_components = 6
principal_components = pca.transform(predictors)[:, :n_components]
# NOTE(review): each column below is a principal component (a linear mix of
# ALL predictors), not the original feature it is named after — downstream
# "feature importances" inherit this mislabeling. PCA on unscaled data also
# lets large-magnitude columns dominate the components; consider
# standardizing before PCA.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (the resulting dummy columns are not used
# by the model below, which draws its inputs from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs: the six principal-component columns; target: mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# Hold out 30% for testing, seeded for reproducibility.
# NOTE(review): a shuffled random split on time-series rows lets neighbouring
# (near-duplicate) days land in both train and test sets — a time-based split
# would give a more honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: fit the scaler on the training split only, then apply
# the identical transform to both splits so no test-set statistics leak into
# training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
# (a default-configured XGBRegressor; GridSearchCV clones and re-parameterizes it)
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3 depths x 3 learning rates x 3 tree counts x 3 gammas x 2 subsample x
# 2 colsample = 324 candidates; with cv=10 that is 3240 model fits.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all cores; scoring defaults to the regressor's R^2 score.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9983847769546339
# Fit the model using the best hyperparameters
# (GridSearchCV with refit=True, the default, has already refit
# best_estimator_ on the full training set, so this fit is redundant but
# harmless)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of the
# two vectors after renormalizing each to sum to 1 — it is not a standard
# regression metric and is ill-defined when y_pred contains zero or negative
# entries; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0017588024130940267 R2 Score: 0.9989760104365151 RMSE: 0.041938 Entropy Value: 0.00044469540288006774
# Rank the model inputs by XGBoost's importance scores, highest first.
# NOTE(review): the inputs are principal components merely labelled with
# original feature names, so these importances describe the components,
# not the raw features.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.625635 |
| 0 | cardiovasc_death_rate | 0.244771 |
| 1 | diabetes_prevalence | 0.109839 |
| 2 | female_smokers | 0.010393 |
| 3 | male_smokers | 0.008904 |
| 4 | life_expectancy | 0.000458 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — parameterize or use a
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Compare Romania and Serbia in the country health index analysis.
country1 = 'Romania'
country2 = 'Serbia'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() detaches the two-country subset from the parent frame so that the
# lagged columns added later do not trigger pandas' SettingWithCopyWarning
# (or silently fail to propagate under copy-on-write).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
2076 rows × 9 columns
# Convert the time series into a supervised-learning layout: keep lagged
# copies of the target on each row so XGBoost can use past mortality as
# tabular features.
# Work on an explicit copy: df_updated is a filtered slice of a larger frame,
# and assigning new columns to a slice raises pandas' SettingWithCopyWarning
# (and may silently not propagate under copy-on-write).
df_updated = df_updated.copy()
# Lag the mortality rate by 1 day, 7 days, and 30 days, shifting within each
# country so values never bleed across locations.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, lag in lag_spec.items():
    # NOTE(review): filling the initial lag window with 0 fabricates
    # "zero mortality" observations; dropping those rows may be safer.
    df_updated[lag_col] = mortality_by_country.shift(lag).fillna(0)
# Principal Component Analysis to reduce multi-collinearity among predictors.
# BUG FIX: the original fit PCA on df_updated.iloc[:, 2:], which included the
# 'Mortality Rate' target column itself — leaking the target into the model
# inputs and inflating every downstream score. PCA is now fit on predictor
# columns only (static country features plus the lagged mortality rates).
predictors = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(predictors)
# Keep 6 components to match the number of input variables used by the
# XGBoost analysis for the country health index.
n_components = 6
principal_components = pca.transform(predictors)[:, :n_components]
# NOTE(review): each column below is a principal component (a linear mix of
# ALL predictors), not the original feature it is named after — downstream
# "feature importances" inherit this mislabeling. PCA on unscaled data also
# lets large-magnitude columns (e.g. population) dominate the components;
# consider standardizing before PCA.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (the resulting dummy columns are not used
# by the model below, which draws its inputs from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs: the six principal-component columns; target: mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# Hold out 30% for testing, seeded for reproducibility.
# NOTE(review): a shuffled random split on time-series rows lets neighbouring
# (near-duplicate) days land in both train and test sets — a time-based split
# would give a more honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: fit the scaler on the training split only, then apply
# the identical transform to both splits so no test-set statistics leak into
# training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
# (a default-configured XGBRegressor; GridSearchCV clones and re-parameterizes it)
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3 depths x 3 learning rates x 3 tree counts x 3 gammas x 2 subsample x
# 2 colsample = 324 candidates; with cv=10 that is 3240 model fits.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all cores; scoring defaults to the regressor's R^2 score.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979474526774726
# Fit the model using the best hyperparameters
# (GridSearchCV with refit=True, the default, has already refit
# best_estimator_ on the full training set, so this fit is redundant but
# harmless)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of the
# two vectors after renormalizing each to sum to 1 — it is not a standard
# regression metric and is ill-defined when y_pred contains zero or negative
# entries; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002076185405706244 R2 Score: 0.9987912273877525 RMSE: 0.045565 Entropy Value: 0.00045853981129780964
# Rank the model inputs by XGBoost's importance scores, highest first.
# NOTE(review): the inputs are principal components merely labelled with
# original feature names, so these importances describe the components,
# not the raw features.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.663901 |
| 0 | hospital_beds_per_thousand | 0.186232 |
| 1 | human_development_index | 0.124847 |
| 2 | extreme_poverty | 0.013139 |
| 3 | gdp_per_capita | 0.011260 |
| 4 | population_density | 0.000621 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — parameterize or use a
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Compare Slovenia and Spain in the population health index analysis.
country1 = 'Slovenia'
country2 = 'Spain'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
# .copy() detaches the two-country subset from the parent frame so that the
# lagged columns added later do not trigger pandas' SettingWithCopyWarning
# (or silently fail to propagate under copy-on-write).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 24074 | Spain | 2/1/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 45.5 | 0.000000 |
| 24075 | Spain | 2/2/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 45.5 | 0.000000 |
| 24076 | Spain | 2/3/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 45.5 | 0.000000 |
| 24077 | Spain | 2/4/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 45.5 | 0.000000 |
| 24078 | Spain | 2/5/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 45.5 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 44.5 | 0.536669 |
2125 rows × 9 columns
# Convert the time series into a supervised-learning layout: keep lagged
# copies of the target on each row so XGBoost can use past mortality as
# tabular features.
# Work on an explicit copy: df_updated is a filtered slice of a larger frame,
# and assigning new columns to a slice raises pandas' SettingWithCopyWarning
# (and may silently not propagate under copy-on-write).
df_updated = df_updated.copy()
# Lag the mortality rate by 1 day, 7 days, and 30 days, shifting within each
# country so values never bleed across locations.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, lag in lag_spec.items():
    # NOTE(review): filling the initial lag window with 0 fabricates
    # "zero mortality" observations; dropping those rows may be safer.
    df_updated[lag_col] = mortality_by_country.shift(lag).fillna(0)
# Principal Component Analysis to reduce multi-collinearity among predictors.
# BUG FIX: the original fit PCA on df_updated.iloc[:, 2:], which included the
# 'Mortality Rate' target column itself — leaking the target into the model
# inputs and inflating every downstream score. PCA is now fit on predictor
# columns only (demographic/health features plus the lagged mortality rates).
predictors = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(predictors)
# Keep 6 components to match the number of input variables used by the
# XGBoost analysis for the population health index.
n_components = 6
principal_components = pca.transform(predictors)[:, :n_components]
# NOTE(review): each column below is a principal component (a linear mix of
# ALL predictors), not the original feature it is named after — downstream
# "feature importances" inherit this mislabeling. PCA on unscaled data also
# lets large-magnitude columns dominate the components; consider
# standardizing before PCA.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (the resulting dummy columns are not used
# by the model below, which draws its inputs from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs: the six principal-component columns; target: mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# Hold out 30% for testing, seeded for reproducibility.
# NOTE(review): a shuffled random split on time-series rows lets neighbouring
# (near-duplicate) days land in both train and test sets — a time-based split
# would give a more honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: fit the scaler on the training split only, then apply
# the identical transform to both splits so no test-set statistics leak into
# training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
# (a default-configured XGBRegressor; GridSearchCV clones and re-parameterizes it)
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3 depths x 3 learning rates x 3 tree counts x 3 gammas x 2 subsample x
# 2 colsample = 324 candidates; with cv=10 that is 3240 model fits.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all cores; scoring defaults to the regressor's R^2 score.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987609289639716
# Fit the model using the best hyperparameters
# (GridSearchCV with refit=True, the default, has already refit
# best_estimator_ on the full training set, so this fit is redundant but
# harmless)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of the
# two vectors after renormalizing each to sum to 1 — it is not a standard
# regression metric and is ill-defined when y_pred contains zero or negative
# entries; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.007138550334511055 R2 Score: 0.9989051784209579 RMSE: 0.084490 Entropy Value: 0.0006678903177231346
# Rank the model inputs by XGBoost's importance scores, highest first.
# NOTE(review): the inputs are principal components merely labelled with
# original feature names, so these importances describe the components,
# not the raw features.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.819366 |
| 0 | cardiovasc_death_rate | 0.131678 |
| 2 | female_smokers | 0.037593 |
| 5 | median_age | 0.009186 |
| 3 | male_smokers | 0.001930 |
| 4 | life_expectancy | 0.000246 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — parameterize or use a
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Slovenia'
country2 = 'Spain'
# Keep only the socioeconomic ("country health index") inputs, identifiers,
# and the target, restricted to the two countries under comparison.
cols_of_interest = ['location', 'date', 'hospital_beds_per_thousand',
                    'human_development_index', 'extreme_poverty', 'gdp_per_capita',
                    'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols_of_interest]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 24074 | Spain | 2/1/2020 | 2.97 | 0.904 | 1.0 | 34272.36 | 93.105 | 47558632 | 0.000000 |
| 24075 | Spain | 2/2/2020 | 2.97 | 0.904 | 1.0 | 34272.36 | 93.105 | 47558632 | 0.000000 |
| 24076 | Spain | 2/3/2020 | 2.97 | 0.904 | 1.0 | 34272.36 | 93.105 | 47558632 | 0.000000 |
| 24077 | Spain | 2/4/2020 | 2.97 | 0.904 | 1.0 | 34272.36 | 93.105 | 47558632 | 0.000000 |
| 24078 | Spain | 2/5/2020 | 2.97 | 0.904 | 1.0 | 34272.36 | 93.105 | 47558632 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 31400.84 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 31400.84 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 31400.84 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 31400.84 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 31400.84 | 102.619 | 2119843 | 0.536669 |
2125 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add per-country lagged mortality features (1 day, 7 days, 30 days back).
# Shifting within each location keeps lags from crossing country boundaries;
# the leading rows that have no history are filled with 0.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' and its lagged copies, so the
# target leaks into the components that later become the model inputs — confirm intended.
# NOTE(review): PCA is fit on the full dataset before the train/test split and on
# unscaled values (the large 'population' column will dominate the variance) — verify.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are the first six principal components, not the original
# variables; relabeling them with the raw feature names makes the later feature-importance
# table read as if it ranked the raw variables — confirm this interpretation is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are created here but never used in X below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so test-set statistics never leak into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor whose hyperparameters are tuned by the search below.
xgb_model = xgb.XGBRegressor()
# Candidate values for each hyperparameter under consideration.
params = dict(max_depth=[3, 4, 5],
              learning_rate=[0.1, 0.01, 0.001],
              n_estimators=[50, 100, 150],
              gamma=[0, 0.1, 0.2],
              subsample=[0.8, 0.9],
              colsample_bytree=[0.8, 0.9])
# Exhaustive grid search with 10-fold cross-validation (k = 10), using all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9976196824477601
# Retrieve the tuned model. GridSearchCV uses refit=True by default, so
# best_estimator_ has already been refit on the full training set — the
# original extra best_model.fit(...) call was redundant and is removed.
best_model = grid_search.best_estimator_
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays to sum to 1
# and returns the KL divergence D(y_test || y_pred), not an entropy of the
# residuals; it is undefined for negative values. Confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.009459864855929317 R2 Score: 0.9985491642289018 RMSE: 0.097262 Entropy Value: 0.0010173017666235732
# Rank the model inputs by XGBoost's importance scores, largest first.
# NOTE(review): the model was trained on principal components relabeled with the
# raw column names, so these importances belong to the components, not the raw variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.731289 |
| 2 | extreme_poverty | 0.139555 |
| 0 | hospital_beds_per_thousand | 0.068121 |
| 5 | population | 0.044294 |
| 3 | gdp_per_capita | 0.014539 |
| 4 | population_density | 0.002202 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — consider a relative path or a config value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Switzerland'
country2 = 'United Kingdom'
# Keep only the demographic/health ("population health index") inputs, identifiers,
# and the target, restricted to the two countries under comparison.
cols_of_interest = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                    'female_smokers', 'male_smokers', 'life_expectancy', 'median_age',
                    'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols_of_interest]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 40.8 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 40.8 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 40.8 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 40.8 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 40.8 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 43.1 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 43.1 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 43.1 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 43.1 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 43.1 | 0.322149 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add per-country lagged mortality features (1 day, 7 days, 30 days back).
# Shifting within each location keeps lags from crossing country boundaries;
# the leading rows that have no history are filled with 0.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' and its lagged copies, so the
# target leaks into the components that later become the model inputs — confirm intended.
# NOTE(review): PCA is fit on the full dataset before the train/test split and on
# unscaled values (the largest-magnitude column will dominate the variance) — verify.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are the first six principal components, not the original
# variables; relabeling them with the raw feature names makes the later feature-importance
# table read as if it ranked the raw variables — confirm this interpretation is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are created here but never used in X below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so test-set statistics never leak into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor whose hyperparameters are tuned by the search below.
xgb_model = xgb.XGBRegressor()
# Candidate values for each hyperparameter under consideration.
params = dict(max_depth=[3, 4, 5],
              learning_rate=[0.1, 0.01, 0.001],
              n_estimators=[50, 100, 150],
              gamma=[0, 0.1, 0.2],
              subsample=[0.8, 0.9],
              colsample_bytree=[0.8, 0.9])
# Exhaustive grid search with 10-fold cross-validation (k = 10), using all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9510892546005006
# Retrieve the tuned model. GridSearchCV uses refit=True by default, so
# best_estimator_ has already been refit on the full training set — the
# original extra best_model.fit(...) call was redundant and is removed.
best_model = grid_search.best_estimator_
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays to sum to 1
# and returns the KL divergence D(y_test || y_pred), not an entropy of the
# residuals; it is undefined for negative values. Confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 1.7690674535543163 R2 Score: 0.93014509580057 RMSE: 1.330063 Entropy Value: 0.007763981883565692
# Rank the model inputs by XGBoost's importance scores, largest first.
# NOTE(review): the model was trained on principal components relabeled with the
# raw column names, so these importances belong to the components, not the raw variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.452751 |
| 1 | diabetes_prevalence | 0.267413 |
| 2 | female_smokers | 0.133603 |
| 4 | life_expectancy | 0.081354 |
| 5 | median_age | 0.041533 |
| 3 | male_smokers | 0.023346 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — consider a relative path or a config value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Switzerland'
country2 = 'United Kingdom'
# Keep only the socioeconomic ("country health index") inputs, identifiers,
# and the target, restricted to the two countries under comparison.
cols_of_interest = ['location', 'date', 'hospital_beds_per_thousand',
                    'human_development_index', 'extreme_poverty', 'gdp_per_capita',
                    'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols_of_interest]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322149 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add per-country lagged mortality features (1 day, 7 days, 30 days back).
# Shifting within each location keeps lags from crossing country boundaries;
# the leading rows that have no history are filled with 0.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' and its lagged copies, so the
# target leaks into the components that later become the model inputs — confirm intended.
# NOTE(review): PCA is fit on the full dataset before the train/test split and on
# unscaled values (the large 'population' column will dominate the variance) — verify.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are the first six principal components, not the original
# variables; relabeling them with the raw feature names makes the later feature-importance
# table read as if it ranked the raw variables — confirm this interpretation is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are created here but never used in X below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so test-set statistics never leak into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor whose hyperparameters are tuned by the search below.
xgb_model = xgb.XGBRegressor()
# Candidate values for each hyperparameter under consideration.
params = dict(max_depth=[3, 4, 5],
              learning_rate=[0.1, 0.01, 0.001],
              n_estimators=[50, 100, 150],
              gamma=[0, 0.1, 0.2],
              subsample=[0.8, 0.9],
              colsample_bytree=[0.8, 0.9])
# Exhaustive grid search with 10-fold cross-validation (k = 10), using all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.952952674424612
# Retrieve the tuned model. GridSearchCV uses refit=True by default, so
# best_estimator_ has already been refit on the full training set — the
# original extra best_model.fit(...) call was redundant and is removed.
best_model = grid_search.best_estimator_
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays to sum to 1
# and returns the KL divergence D(y_test || y_pred), not an entropy of the
# residuals; it is undefined for negative values. Confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.8303105819555134 R2 Score: 0.9672136491789828 RMSE: 0.911214 Entropy Value: 0.006240288209575662
# Rank the model inputs by XGBoost's importance scores, largest first.
# NOTE(review): the model was trained on principal components relabeled with the
# raw column names, so these importances belong to the components, not the raw variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.314746 |
| 1 | human_development_index | 0.291846 |
| 2 | extreme_poverty | 0.138111 |
| 0 | hospital_beds_per_thousand | 0.116382 |
| 4 | population_density | 0.079420 |
| 3 | gdp_per_capita | 0.059494 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — consider a relative path or a config value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Cyprus'
country2 = 'Iceland'
# Keep only the demographic/health ("population health index") inputs, identifiers,
# and the target, restricted to the two countries under comparison.
cols_of_interest = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                    'female_smokers', 'male_smokers', 'life_expectancy', 'median_age',
                    'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols_of_interest]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 37.3 | 0.00000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 37.3 | 0.00000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 37.3 | 0.00000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 37.3 | 0.00000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 37.3 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 37.3 | 0.11011 |
2063 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add per-country lagged mortality features (1 day, 7 days, 30 days back).
# Shifting within each location keeps lags from crossing country boundaries;
# the leading rows that have no history are filled with 0.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to address
# multi-collinearity among the predictors.
# BUG FIX: the original code fitted PCA on df_updated.iloc[:, 2:], which
# includes the 'Mortality Rate' TARGET column itself. That leaks the target
# into the model inputs and inflates every downstream score. PCA is now fit
# only on genuine predictors: the health features plus the lagged-mortality
# history columns.
# NOTE(review): PCA is scale-sensitive and these features are on very
# different scales — consider standardizing before fitting.
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Setting the number of principal components to 6 as this equals the number
# of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(pca_input)[:, :n_components]
# Resulting dataframe of the first six principal components.
# NOTE(review): the column labels reuse the raw feature names so downstream
# cells keep working, but each column is a principal component (a linear mix
# of all predictors), not the named feature itself.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# (the dummy columns are not used as model inputs below; X is built from principal_df)
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs: the six principal-component columns; target: the mortality rate
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles randomly, so test rows can fall
# between training rows in time; a chronological split would give a stricter
# out-of-sample evaluation for this time-series data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (its statistics are reused on the test set)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set (using statistics fitted on the training data)
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set with the same training-set statistics
X_test_scaled = scaler.transform(X_test)
# Define XGBoost gradient-boosted regression model (defaults; tuned below)
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid to tune: 3*3*3*3*2*2 = 324 candidate configurations
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform exhaustive grid search with 10-fold cross-validation (k = 10),
# parallelized across all CPU cores (n_jobs=-1)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and their cross-validated score
# (the estimator's default metric)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9884723572153133
# Fit the model using the best hyperparameters found by the grid search
# (GridSearchCV refits the best estimator by default, so this explicit
# fit repeats that step; it is redundant but harmless)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its two arguments into
# probability distributions and returns their KL divergence; applying it to
# raw regression targets/predictions is unusual — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0004166879516503694 R2 Score: 0.9978755777516257 RMSE: 0.020413 Entropy Value: 0.0007951400229065907
# Extract per-input importances from the tuned model.
# NOTE(review): the model was trained on principal components, so these
# importances belong to the components; the labels in selected_cols are raw
# feature names reused as component labels, not the raw features themselves.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
# Rank inputs from most to least important
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.625744 |
| 0 | cardiovasc_death_rate | 0.234341 |
| 5 | median_age | 0.057504 |
| 3 | male_smokers | 0.032153 |
| 2 | female_smokers | 0.027268 |
| 4 | life_expectancy | 0.022991 |
# Importing the dataframe of all 26 countries (local preprocessed OWID export)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded dataframe (notebook cell output)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run
country1 = 'Cyprus'
country2 = 'Iceland'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| 3127 | Cyprus | 3/9/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| 3128 | Cyprus | 3/10/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| 3129 | Cyprus | 3/11/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| 3130 | Cyprus | 3/12/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
2063 rows × 9 columns
# Convert the OWID COVID-19 time series into a supervised-learning layout:
# give each row its own recent mortality history (previous day / week /
# month) via pandas groupby + shift(), so tree models such as XGBoost can
# consume the data as an ordinary tabular problem and the lagged rates can
# act as per-country features.
lag_specs = {'prev_day_mortality': 1,
             'prev_week_mortality': 7,
             'prev_month_mortality': 30}
grouped_mortality = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_specs.items():
    # The first rows of each country's series have no history at that lag;
    # treat the missing value as 0, matching the original preprocessing.
    df_updated[lag_col] = grouped_mortality.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to address
# multi-collinearity among the predictors.
# BUG FIX: the original code fitted PCA on df_updated.iloc[:, 2:], which
# includes the 'Mortality Rate' TARGET column itself. That leaks the target
# into the model inputs and inflates every downstream score. PCA is now fit
# only on genuine predictors: the country-health features plus the
# lagged-mortality history columns.
# NOTE(review): PCA is scale-sensitive and these features are on very
# different scales (e.g. population vs. rates) — consider standardizing
# before fitting.
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Setting the number of principal components to 6 as this equals the number
# of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(pca_input)[:, :n_components]
# Resulting dataframe of the first six principal components.
# NOTE(review): the column labels reuse the raw feature names so downstream
# cells keep working, but each column is a principal component (a linear mix
# of all predictors), not the named feature itself.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# (the dummy columns are not used as model inputs below; X is built from principal_df)
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs: the six principal-component columns; target: the mortality rate
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles randomly, so test rows can fall
# between training rows in time; a chronological split would give a stricter
# out-of-sample evaluation for this time-series data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (its statistics are reused on the test set)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set (using statistics fitted on the training data)
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set with the same training-set statistics
X_test_scaled = scaler.transform(X_test)
# Define XGBoost gradient-boosted regression model (defaults; tuned below)
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid to tune: 3*3*3*3*2*2 = 324 candidate configurations
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform exhaustive grid search with 10-fold cross-validation (k = 10),
# parallelized across all CPU cores (n_jobs=-1)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and their cross-validated score
# (the estimator's default metric)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.988544640680141
# Fit the model using the best hyperparameters found by the grid search
# (GridSearchCV refits the best estimator by default, so this explicit
# fit repeats that step; it is redundant but harmless)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its two arguments into
# probability distributions and returns their KL divergence; applying it to
# raw regression targets/predictions is unusual — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0009590270117916058 R2 Score: 0.9951105418033505 RMSE: 0.030968 Entropy Value: 0.0017192056737414868
# Extract per-input importances from the tuned model.
# NOTE(review): the model was trained on principal components, so these
# importances belong to the components; the labels in selected_cols are raw
# feature names reused as component labels, not the raw features themselves.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
# Rank inputs from most to least important
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.542660 |
| 0 | hospital_beds_per_thousand | 0.171023 |
| 2 | extreme_poverty | 0.106136 |
| 3 | gdp_per_capita | 0.073260 |
| 4 | population_density | 0.055552 |
| 5 | population | 0.051370 |
# Importing the dataframe of all 26 countries (local preprocessed OWID export)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded dataframe (notebook cell output)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run
country1 = 'Ireland'
country2 = 'Luxembourg'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 39.7 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 39.7 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 39.7 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 39.7 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 39.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 38.7 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 38.7 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 38.7 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 38.7 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 38.7 | 0.491388 |
2076 rows × 9 columns
# Convert the OWID COVID-19 time series into a supervised-learning layout:
# give each row its own recent mortality history (previous day / week /
# month) via pandas groupby + shift(), so tree models such as XGBoost can
# consume the data as an ordinary tabular problem and the lagged rates can
# act as per-country features.
lag_specs = {'prev_day_mortality': 1,
             'prev_week_mortality': 7,
             'prev_month_mortality': 30}
grouped_mortality = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_specs.items():
    # The first rows of each country's series have no history at that lag;
    # treat the missing value as 0, matching the original preprocessing.
    df_updated[lag_col] = grouped_mortality.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to address
# multi-collinearity among the predictors.
# BUG FIX: the original code fitted PCA on df_updated.iloc[:, 2:], which
# includes the 'Mortality Rate' TARGET column itself. That leaks the target
# into the model inputs and inflates every downstream score. PCA is now fit
# only on genuine predictors: the health features plus the lagged-mortality
# history columns.
# NOTE(review): PCA is scale-sensitive and these features are on very
# different scales — consider standardizing before fitting.
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Setting the number of principal components to 6 as this equals the number
# of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(pca_input)[:, :n_components]
# Resulting dataframe of the first six principal components.
# NOTE(review): the column labels reuse the raw feature names so downstream
# cells keep working, but each column is a principal component (a linear mix
# of all predictors), not the named feature itself.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# (the dummy columns are not used as model inputs below; X is built from principal_df)
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs: the six principal-component columns; target: the mortality rate
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles randomly, so test rows can fall
# between training rows in time; a chronological split would give a stricter
# out-of-sample evaluation for this time-series data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (its statistics are reused on the test set)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set (using statistics fitted on the training data)
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set with the same training-set statistics
X_test_scaled = scaler.transform(X_test)
# Define XGBoost gradient-boosted regression model (defaults; tuned below)
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid to tune: 3*3*3*3*2*2 = 324 candidate configurations
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform exhaustive grid search with 10-fold cross-validation (k = 10),
# parallelized across all CPU cores (n_jobs=-1)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and their cross-validated score
# (the estimator's default metric)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979194282791959
# Fit the model using the best hyperparameters found by the grid search
# (GridSearchCV refits the best estimator by default, so this explicit
# fit repeats that step; it is redundant but harmless)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its two arguments into
# probability distributions and returns their KL divergence; applying it to
# raw regression targets/predictions is unusual — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002869722959012224 R2 Score: 0.9987432806147093 RMSE: 0.053570 Entropy Value: 0.0005898323283403529
# Extract per-input importances from the tuned model.
# NOTE(review): the model was trained on principal components, so these
# importances belong to the components; the labels in selected_cols are raw
# feature names reused as component labels, not the raw features themselves.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
# Rank inputs from most to least important
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.668046 |
| 1 | diabetes_prevalence | 0.137005 |
| 3 | male_smokers | 0.069868 |
| 2 | female_smokers | 0.062073 |
| 5 | median_age | 0.061625 |
| 4 | life_expectancy | 0.001384 |
# Importing the dataframe of all 26 countries (local preprocessed OWID export)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded dataframe (notebook cell output)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run
country1 = 'Ireland'
country2 = 'Luxembourg'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
2076 rows × 9 columns
# Convert the OWID COVID-19 time series into a supervised-learning layout:
# give each row its own recent mortality history (previous day / week /
# month) via pandas groupby + shift(), so tree models such as XGBoost can
# consume the data as an ordinary tabular problem and the lagged rates can
# act as per-country features.
lag_specs = {'prev_day_mortality': 1,
             'prev_week_mortality': 7,
             'prev_month_mortality': 30}
grouped_mortality = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_specs.items():
    # The first rows of each country's series have no history at that lag;
    # treat the missing value as 0, matching the original preprocessing.
    df_updated[lag_col] = grouped_mortality.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to address
# multi-collinearity among the predictors.
# BUG FIX: the original code fitted PCA on df_updated.iloc[:, 2:], which
# includes the 'Mortality Rate' TARGET column itself. That leaks the target
# into the model inputs and inflates every downstream score. PCA is now fit
# only on genuine predictors: the country-health features plus the
# lagged-mortality history columns.
# NOTE(review): PCA is scale-sensitive and these features are on very
# different scales (e.g. population vs. rates) — consider standardizing
# before fitting.
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Setting the number of principal components to 6 as this equals the number
# of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(pca_input)[:, :n_components]
# Resulting dataframe of the first six principal components.
# NOTE(review): the column labels reuse the raw feature names so downstream
# cells keep working, but each column is a principal component (a linear mix
# of all predictors), not the named feature itself.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# (the dummy columns are not used as model inputs below; X is built from principal_df)
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs: the six principal-component columns; target: the mortality rate
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles randomly, so test rows can fall
# between training rows in time; a chronological split would give a stricter
# out-of-sample evaluation for this time-series data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (its statistics are reused on the test set)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set (using statistics fitted on the training data)
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set with the same training-set statistics
X_test_scaled = scaler.transform(X_test)
# Define XGBoost gradient-boosted regression model (defaults; tuned below)
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid to tune: 3*3*3*3*2*2 = 324 candidate configurations
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform exhaustive grid search with 10-fold cross-validation (k = 10),
# parallelized across all CPU cores (n_jobs=-1)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and their cross-validated score
# (the estimator's default metric)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9986051830769147
# Fit the model using the best hyperparameters found by the grid search
# (GridSearchCV refits the best estimator by default, so this explicit
# fit repeats that step; it is redundant but harmless)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its two arguments into
# probability distributions and returns their KL divergence; applying it to
# raw regression targets/predictions is unusual — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0015122598797065847 R2 Score: 0.9993377457219498 RMSE: 0.038888 Entropy Value: 0.0003022118010805689
# Extract per-input importances from the tuned model.
# NOTE(review): the model was trained on principal components, so these
# importances belong to the components; the labels in selected_cols are raw
# feature names reused as component labels, not the raw features themselves.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
# Rank inputs from most to least important
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.715384 |
| 1 | human_development_index | 0.205967 |
| 0 | hospital_beds_per_thousand | 0.052834 |
| 2 | extreme_poverty | 0.024062 |
| 3 | gdp_per_capita | 0.001458 |
| 4 | population_density | 0.000295 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — parameterise (or use a
# relative path) before running this notebook anywhere else.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression: notebook-style display of the loaded frame.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries analysed in this section.
country1 = 'Slovakia'
country2 = 'United States'
# Keep only the population-health features plus the identifiers and target.
cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
        'female_smokers', 'male_smokers', 'life_expectancy', 'median_age',
        'Mortality Rate']
df_updated = df_updated[cols]
# Restrict to the two countries of this pair.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 38.3 | 1.084791 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per country so one country's series never bleeds into another's.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no history to lag from)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' itself plus its lagged
# copies, so the prediction target leaks into the PCA inputs used as model
# features — this likely inflates the reported R^2; confirm and exclude it.
# NOTE(review): PCA is fit on unscaled data here (scaling happens only after
# PCA), so large-magnitude columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
(Notebook output: estimator repr only — the interactive HTML view does not survive export.)
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project the same (unscaled) feature matrix onto the first 6 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the PC columns are relabelled with the original feature
# names, but each column is actually a linear mix of ALL inputs (including
# 'Mortality Rate' and its lags) — the names, and any feature importances
# derived from them, are therefore misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
# Target: the (unlagged) mortality rate.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split on time-series rows lets lagged
# copies of test-period targets appear in training — temporal leakage;
# a chronological split would be the safer choice.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
(Notebook output: estimator repr only — the interactive HTML view does not survive export.)
# Standardise both splits using the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base XGBoost regressor; its hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()

# Candidate hyperparameter grid.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}

# Exhaustive search over the grid with 10-fold CV, using all CPU cores.
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9878328845258098
# Retrieve the best model found by the grid search.  GridSearchCV
# (refit=True by default) has already refit this estimator on the full
# training set, so an extra fit() call is redundant and is omitted.
best_model = grid_search.best_estimator_
# Making predictions on the scaled held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 Score, and a KL-divergence-based "entropy" score.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) is the KL divergence between the normalised
# vectors; zero (or negative) entries make it undefined — this very cell
# previously printed "Entropy Value: inf".  Clip both vectors to a tiny
# positive floor to keep the value finite.  NOTE(review): KL divergence is
# not a conventional regression metric; confirm it is really wanted here.
eps = np.finfo(float).eps
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00894710610093598 R2 Score: 0.9915900687687879 RMSE: 0.094589 Entropy Value: inf
# Rank the model inputs by XGBoost importance, highest first.
# NOTE(review): the inputs are PCA components that were merely relabelled
# with the original column names, so these importances describe the
# components, not the raw features themselves.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.468440 |
| 0 | cardiovasc_death_rate | 0.277746 |
| 5 | median_age | 0.202056 |
| 2 | female_smokers | 0.031399 |
| 4 | life_expectancy | 0.011402 |
| 3 | male_smokers | 0.008956 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — parameterise (or use a
# relative path) before running this notebook anywhere else.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression: notebook-style display of the loaded frame.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries analysed in this section.
country1 = 'Slovakia'
country2 = 'United States'
# Keep only the country-health features plus the identifiers and target.
cols = ['location', 'date', 'hospital_beds_per_thousand',
        'human_development_index', 'extreme_poverty', 'gdp_per_capita',
        'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[cols]
# Restrict to the two countries of this pair.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per country so one country's series never bleeds into another's.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no history to lag from)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' itself plus its lagged
# copies, so the prediction target leaks into the PCA inputs used as model
# features — this likely inflates the reported R^2; confirm and exclude it.
# NOTE(review): PCA is fit on unscaled data here (scaling happens only after
# PCA), so large-magnitude columns such as 'population' dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
(Notebook output: estimator repr only — the interactive HTML view does not survive export.)
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project the same (unscaled) feature matrix onto the first 6 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the PC columns are relabelled with the original feature
# names, but each column is actually a linear mix of ALL inputs (including
# 'Mortality Rate' and its lags) — the names, and any feature importances
# derived from them, are therefore misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Target: the (unlagged) mortality rate.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split on time-series rows lets lagged
# copies of test-period targets appear in training — temporal leakage;
# a chronological split would be the safer choice.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
(Notebook output: estimator repr only — the interactive HTML view does not survive export.)
# Standardise both splits using the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base XGBoost regressor; its hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()

# Candidate hyperparameter grid.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}

# Exhaustive search over the grid with 10-fold CV, using all CPU cores.
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.988734157660151
# Retrieve the best model found by the grid search.  GridSearchCV
# (refit=True by default) has already refit this estimator on the full
# training set, so an extra fit() call is redundant and is omitted.
best_model = grid_search.best_estimator_
# Making predictions on the scaled held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 Score, and a KL-divergence-based "entropy" score.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) is the KL divergence between the normalised
# vectors; zero (or negative) entries make it undefined — this very cell
# previously printed "Entropy Value: inf".  Clip both vectors to a tiny
# positive floor to keep the value finite.  NOTE(review): KL divergence is
# not a conventional regression metric; confirm it is really wanted here.
eps = np.finfo(float).eps
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0038799006752455517 R2 Score: 0.9963530444934217 RMSE: 0.062289 Entropy Value: inf
# Rank the model inputs by XGBoost importance, highest first.
# NOTE(review): the inputs are PCA components that were merely relabelled
# with the original column names, so these importances describe the
# components, not the raw features themselves.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.383000 |
| 1 | human_development_index | 0.344298 |
| 0 | hospital_beds_per_thousand | 0.134705 |
| 3 | gdp_per_capita | 0.067241 |
| 2 | extreme_poverty | 0.057677 |
| 4 | population_density | 0.013079 |
# Country Pair by Pair Analysis relative to diabetes prevalence
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): hard-coded absolute Windows path — parameterise (or use a
# relative path) before running this notebook anywhere else.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
# Bare expression: notebook-style display of the loaded frame.
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Per-country sub-frames used to assemble the 13 diabetes-prevalence pairs.
df_Belgium = df.loc[df["location"].eq("Belgium")]
df_Estonia = df.loc[df["location"].eq("Estonia")]
df_France = df.loc[df["location"].eq("France")]
df_Iceland = df.loc[df["location"].eq("Iceland")]
df_Ireland = df.loc[df["location"].eq("Ireland")]
df_Italy = df.loc[df["location"].eq("Italy")]
df_Latvia = df.loc[df["location"].eq("Latvia")]
df_Luxembourg = df.loc[df["location"].eq("Luxembourg")]
df_Netherlands = df.loc[df["location"].eq("Netherlands")]
df_Sweden = df.loc[df["location"].eq("Sweden")]
df_UnitedKingdom = df.loc[df["location"].eq("United Kingdom")]
df_Austria = df.loc[df["location"].eq("Austria")]
df_Bulgaria = df.loc[df["location"].eq("Bulgaria")]
df_Czechia = df.loc[df["location"].eq("Czechia")]
df_Denmark = df.loc[df["location"].eq("Denmark")]
df_Finland = df.loc[df["location"].eq("Finland")]
df_Switzerland = df.loc[df["location"].eq("Switzerland")]
df_Canada = df.loc[df["location"].eq("Canada")]
df_Cyprus = df.loc[df["location"].eq("Cyprus")]
df_Portugal = df.loc[df["location"].eq("Portugal")]
df_Romania = df.loc[df["location"].eq("Romania")]
df_Serbia = df.loc[df["location"].eq("Serbia")]
df_Slovakia = df.loc[df["location"].eq("Slovakia")]
df_Slovenia = df.loc[df["location"].eq("Slovenia")]
df_Spain = df.loc[df["location"].eq("Spain")]
df_UnitedStates = df.loc[df["location"].eq("United States")]
# Drop the first two UK rows (equivalent to tail(-2)) — presumably to align
# its start date with the other countries; confirm against the raw data.
df_UnitedKingdom_new = df_UnitedKingdom.iloc[2:]
# Concatenate the per-country frames into one dataframe for the pair-by-pair
# analyses (order preserved from the original pairing scheme).
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)
# Persist the combined frame so later sections can reload it from disk.
dataframe_one.to_csv("dataframe-one.csv")
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — parameterise (or use a
# relative path) before running this notebook anywhere else.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression: notebook-style display of the loaded frame.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries analysed in this section.
country1 = 'Belgium'
country2 = 'Estonia'
# Keep only the population-health features plus the identifiers and target.
cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers',
        'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
        'Mortality Rate']
df_updated = df_updated[cols]
# Restrict to the two countries of this pair.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 114.898 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 114.898 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 114.898 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 114.898 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 114.898 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 255.569 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 255.569 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 255.569 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 255.569 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 255.569 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.466423 |
2121 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per country so one country's series never bleeds into another's.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no history to lag from)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' itself plus its lagged
# copies, so the prediction target leaks into the PCA inputs used as model
# features — this likely inflates the reported R^2; confirm and exclude it.
# NOTE(review): PCA is fit on unscaled data here (scaling happens only after
# PCA), so large-magnitude columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
(Notebook output: estimator repr only — the interactive HTML view does not survive export.)
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project the same (unscaled) feature matrix onto the first 6 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the PC columns are relabelled with the original feature
# names, but each column is actually a linear mix of ALL inputs (including
# 'Mortality Rate' and its lags) — the names, and any feature importances
# derived from them, are therefore misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target: the (unlagged) mortality rate.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split on time-series rows lets lagged
# copies of test-period targets appear in training — temporal leakage;
# a chronological split would be the safer choice.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
(Notebook output: estimator repr only — the interactive HTML view does not survive export.)
# Standardise both splits using the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base XGBoost regressor; its hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()

# Candidate hyperparameter grid.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}

# Exhaustive search over the grid with 10-fold CV, using all CPU cores.
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9989949619386504
# Retrieve the best model found by the grid search.  GridSearchCV
# (refit=True by default) has already refit this estimator on the full
# training set, so an extra fit() call is redundant and is omitted.
best_model = grid_search.best_estimator_
# Making predictions on the scaled held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 Score, and a KL-divergence-based "entropy" score.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) is the KL divergence between the normalised
# vectors; zero (or negative) entries make it undefined/inf (other runs of
# this pipeline print `inf`), so clip both vectors to a tiny positive
# floor first.  NOTE(review): KL divergence is not a conventional
# regression metric; confirm it is really wanted here.
eps = np.finfo(float).eps
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.012575343517403465 R2 Score: 0.9990037276216706 RMSE: 0.112140 Entropy Value: 0.0005772386498847016
# Rank the model inputs by XGBoost importance, highest first.
# NOTE(review): the inputs are PCA components that were merely relabelled
# with the original column names, so these importances describe the
# components, not the raw features themselves.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.786635 |
| 0 | cardiovasc_death_rate | 0.176158 |
| 2 | male_smokers | 0.024730 |
| 5 | median_age | 0.010989 |
| 3 | life_expectancy | 0.001396 |
| 4 | aged_65_older | 0.000091 |
# Load the combined Our World in Data dataframe covering all 26 countries.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the country-health-index analysis.
country1 = 'Belgium'
country2 = 'Estonia'
# Keep the identifiers, the country-health-index predictors, and the target.
keep_cols = [
    'location', 'date',
    'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
    'gdp_per_capita', 'population_density', 'population',
    'Mortality Rate',
]
df_updated = df_updated[keep_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
2121 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Grouping by location keeps one country's lags from bleeding into the next.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows fabricates a 0 mortality rate for
# each country's first day/week/month -- confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target)
# and its lag columns among the PCA inputs, and PCA is fitted on ALL rows
# before the train/test split -- both leak target information into the
# features and likely explain the near-perfect R^2 reported downstream.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# NOTE(review): pca was fitted on df_updated.iloc[:, 2:], which contains the
# target and its lags, so the components below carry target information.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a linear mix of ALL
# PCA inputs), not the original variable it is named after; reusing the raw
# feature names makes the later feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location dummy columns are never used in X below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data only.
X_train_scaled, X_test_scaled = (scaler.transform(m) for m in (X_train, X_test))
# Base XGBoost regressor; its hyperparameters are chosen by the grid search below.
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values to search over.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation, parallelized on all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9985104636986213
# grid_search was built with GridSearchCV's default refit=True, so
# best_estimator_ has already been refit on the full training set --
# a second fit() call here is redundant and was removed.
best_model = grid_search.best_estimator_
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) rescales both vectors into
# probability distributions and returns their KL divergence; it is not a
# standard regression error metric and can be inf/nan when predictions are
# non-positive -- confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.012715196293674761 R2 Score: 0.9989926478879171 RMSE: 0.112762 Entropy Value: 0.0008232624023628059
# Feature importances reported by the fitted XGBoost model.
feature_importances = best_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these scores
# belong to PCA components; labeling them with selected_cols (the original
# column names) is misleading -- each component mixes all PCA inputs.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.744249 |
| 0 | hospital_beds_per_thousand | 0.102687 |
| 2 | extreme_poverty | 0.100773 |
| 5 | population | 0.041484 |
| 3 | gdp_per_capita | 0.010488 |
| 4 | population_density | 0.000319 |
# Load the combined Our World in Data dataframe covering all 26 countries.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the population-health-index analysis.
country1 = 'France'
country2 = 'Iceland'
# Keep the identifiers, the population-health-index predictors, and the target.
keep_cols = [
    'location', 'date',
    'cardiovasc_death_rate', 'female_smokers', 'male_smokers',
    'life_expectancy', 'aged_65_older', 'median_age',
    'Mortality Rate',
]
df_updated = df_updated[keep_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 86.060 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| 8377 | France | 1/25/2020 | 86.060 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| 8378 | France | 1/26/2020 | 86.060 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| 8379 | France | 1/27/2020 | 86.060 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| 8380 | France | 1/28/2020 | 86.060 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
2107 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Grouping by location keeps one country's lags from bleeding into the next.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows fabricates a 0 mortality rate for
# each country's first day/week/month -- confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target)
# and its lag columns among the PCA inputs, and PCA is fitted on ALL rows
# before the train/test split -- both leak target information into the
# features and likely explain the near-perfect R^2 reported downstream.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
# NOTE(review): pca was fitted on df_updated.iloc[:, 2:], which contains the
# target and its lags, so the components below carry target information.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a linear mix of ALL
# PCA inputs), not the original variable it is named after; reusing the raw
# feature names makes the later feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location dummy columns are never used in X below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data only.
X_train_scaled, X_test_scaled = (scaler.transform(m) for m in (X_train, X_test))
# Base XGBoost regressor; its hyperparameters are chosen by the grid search below.
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values to search over.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation, parallelized on all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9952822274580472
# grid_search was built with GridSearchCV's default refit=True, so
# best_estimator_ has already been refit on the full training set --
# a second fit() call here is redundant and was removed.
best_model = grid_search.best_estimator_
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) rescales both vectors into
# probability distributions and returns their KL divergence; it is not a
# standard regression error metric and can be inf/nan when predictions are
# non-positive -- confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.04901676742466076 R2 Score: 0.9961022286797971 RMSE: 0.221397 Entropy Value: 0.002171781112379247
# Feature importances reported by the fitted XGBoost model.
feature_importances = best_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these scores
# belong to PCA components; labeling them with selected_cols (the original
# column names) is misleading -- each component mixes all PCA inputs.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.806289 |
| 0 | cardiovasc_death_rate | 0.144819 |
| 5 | median_age | 0.021652 |
| 2 | male_smokers | 0.010381 |
| 3 | life_expectancy | 0.008614 |
| 4 | aged_65_older | 0.008244 |
# Load the combined Our World in Data dataframe covering all 26 countries.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the country-health-index analysis.
country1 = 'France'
country2 = 'Iceland'
# Keep the identifiers, the country-health-index predictors, and the target.
keep_cols = [
    'location', 'date',
    'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
    'gdp_per_capita', 'population_density', 'population',
    'Mortality Rate',
]
df_updated = df_updated[keep_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8377 | France | 1/25/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8378 | France | 1/26/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8379 | France | 1/27/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8380 | France | 1/28/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
2107 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Grouping by location keeps one country's lags from bleeding into the next.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows fabricates a 0 mortality rate for
# each country's first day/week/month -- confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target)
# and its lag columns among the PCA inputs, and PCA is fitted on ALL rows
# before the train/test split -- both leak target information into the
# features and likely explain the near-perfect R^2 reported downstream.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# NOTE(review): pca was fitted on df_updated.iloc[:, 2:], which contains the
# target and its lags, so the components below carry target information.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a linear mix of ALL
# PCA inputs), not the original variable it is named after; reusing the raw
# feature names makes the later feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location dummy columns are never used in X below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data only.
X_train_scaled, X_test_scaled = (scaler.transform(m) for m in (X_train, X_test))
# Base XGBoost regressor; its hyperparameters are chosen by the grid search below.
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values to search over.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation, parallelized on all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9946715554617531
# grid_search was built with GridSearchCV's default refit=True, so
# best_estimator_ has already been refit on the full training set --
# a second fit() call here is redundant and was removed.
best_model = grid_search.best_estimator_
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) rescales both vectors into
# probability distributions and returns their KL divergence; it is not a
# standard regression error metric and can be inf/nan when predictions are
# non-positive -- confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.06349754860870421 R2 Score: 0.9949507293753999 RMSE: 0.251987 Entropy Value: 0.002273841175109527
# Feature importances reported by the fitted XGBoost model.
feature_importances = best_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these scores
# belong to PCA components; labeling them with selected_cols (the original
# column names) is misleading -- each component mixes all PCA inputs.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.739557 |
| 0 | hospital_beds_per_thousand | 0.133697 |
| 4 | population_density | 0.060550 |
| 5 | population | 0.031155 |
| 3 | gdp_per_capita | 0.017913 |
| 2 | extreme_poverty | 0.017128 |
# Load the combined Our World in Data dataframe covering all 26 countries.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the population-health-index analysis.
country1 = 'Ireland'
country2 = 'Italy'
# Keep the identifiers, the population-health-index predictors, and the target.
keep_cols = [
    'location', 'date',
    'cardiovasc_death_rate', 'female_smokers', 'male_smokers',
    'life_expectancy', 'aged_65_older', 'median_age',
    'Mortality Rate',
]
df_updated = df_updated[keep_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 126.459 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 126.459 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 126.459 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 126.459 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 126.459 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build supervised-learning lag features from the mortality time series:
# per-country 1-day, 7-day, and 30-day lags of 'Mortality Rate'.
# Rows with no earlier observation to lag from are filled with 0.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (
        df_updated.groupby(['location'])['Mortality Rate'].shift(lag).fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on unscaled features and on the FULL dataset (before the
# train/test split), so high-variance columns dominate the rotation and test rows
# leak into it. The iloc[:, 2:] slice also includes 'Mortality Rate' and its lag
# columns, i.e. the target itself enters the components — confirm this is intended.
pca = PCA()
# Columns 0-1 are 'location' and 'date'; everything from column index 2 on is numeric.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project every row onto the fitted rotation and keep only the first 6 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of ALL
# numeric inputs, which here include 'Mortality Rate' and its lag features), not
# the original variables they are named after — the labels are misleading and the
# target leaks into X. Confirm whether this is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used downstream (X is
# built from principal_df); this step only removes 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target variable: the raw (unlagged) mortality rate.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Gradient-boosted tree regressor (XGBoost)
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the grid search
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelized over all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990989661235357
# Refit the best estimator found by the grid search on the full training set
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) is the KL divergence between the two (internally
# normalized) vectors. It returns inf whenever y_pred has a zero (or negative)
# entry where y_test is positive, so clip both series to a tiny positive floor
# first to keep the metric finite on mortality series containing zeros.
entropy_val = entropy(np.clip(y_test, 1e-12, None), np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0062496276503529605 R2 Score: 0.9994865789352944 RMSE: 0.079055 Entropy Value: 0.00034870367258360715
# Gain-based importance scores from the fitted booster, sorted most -> least important.
feature_importances = best_model.feature_importances_
# NOTE(review): the "features" here are principal components that were merely
# labelled with the original column names; the importances describe the PCs,
# not the raw variables they are named after.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.436515 |
| 0 | cardiovasc_death_rate | 0.419361 |
| 5 | median_age | 0.110731 |
| 2 | male_smokers | 0.031872 |
| 3 | life_expectancy | 0.000899 |
| 4 | aged_65_older | 0.000622 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute Windows-specific path — consider a relative path or a
# configurable constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Ireland'
country2 = 'Italy'
# Keep the country-health-index predictors (plus identifiers and the target),
# then restrict to the two countries under comparison, in a single .loc step.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
in_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_countries, keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
Converting the time-series data into this supervised, tabular format is what allows the XGBoost model to be applied directly to assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Build supervised-learning lag features from the mortality time series:
# per-country 1-day, 7-day, and 30-day lags of 'Mortality Rate'.
# Rows with no earlier observation to lag from are filled with 0.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (
        df_updated.groupby(['location'])['Mortality Rate'].shift(lag).fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on unscaled features and on the FULL dataset (before the
# train/test split), so high-variance columns dominate the rotation and test rows
# leak into it. The iloc[:, 2:] slice also includes 'Mortality Rate' and its lag
# columns, i.e. the target itself enters the components — confirm this is intended.
pca = PCA()
# Columns 0-1 are 'location' and 'date'; everything from column index 2 on is numeric.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project every row onto the fitted rotation and keep only the first 6 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of ALL
# numeric inputs, which here include 'Mortality Rate' and its lag features), not
# the original variables they are named after — the labels are misleading and the
# target leaks into X. Confirm whether this is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used downstream (X is
# built from principal_df); this step only removes 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Target variable: the raw (unlagged) mortality rate.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Gradient-boosted tree regressor (XGBoost)
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the grid search
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelized over all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9988984971748192
# Refit the best estimator found by the grid search on the full training set
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) is the KL divergence between the two (internally
# normalized) vectors. It returns inf whenever y_pred has a zero (or negative)
# entry where y_test is positive, so clip both series to a tiny positive floor
# first to keep the metric finite on mortality series containing zeros.
entropy_val = entropy(np.clip(y_test, 1e-12, None), np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.009889496543689754 R2 Score: 0.9991875554626719 RMSE: 0.099446 Entropy Value: 0.00044270709478587373
# Gain-based importance scores from the fitted booster, sorted most -> least important.
feature_importances = best_model.feature_importances_
# NOTE(review): the "features" here are principal components that were merely
# labelled with the original column names; the importances describe the PCs,
# not the raw variables they are named after.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.702587 |
| 0 | hospital_beds_per_thousand | 0.200820 |
| 5 | population | 0.042593 |
| 2 | extreme_poverty | 0.026734 |
| 3 | gdp_per_capita | 0.018469 |
| 4 | population_density | 0.008797 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute Windows-specific path — consider a relative path or a
# configurable constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Latvia'
country2 = 'Luxembourg'
# Keep the population-health-index predictors (plus identifiers and the target),
# then restrict to the two countries under comparison, in a single .loc step.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
in_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_countries, keep_cols]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 128.275 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 128.275 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 128.275 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 128.275 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 128.275 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.060 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.060 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.060 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.060 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.060 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631969 |
2079 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
Converting the time-series data into this supervised, tabular format is what allows the XGBoost model to be applied directly to assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Build supervised-learning lag features from the mortality time series:
# per-country 1-day, 7-day, and 30-day lags of 'Mortality Rate'.
# Rows with no earlier observation to lag from are filled with 0.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (
        df_updated.groupby(['location'])['Mortality Rate'].shift(lag).fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on unscaled features and on the FULL dataset (before the
# train/test split), so high-variance columns dominate the rotation and test rows
# leak into it. The iloc[:, 2:] slice also includes 'Mortality Rate' and its lag
# columns, i.e. the target itself enters the components — confirm this is intended.
pca = PCA()
# Columns 0-1 are 'location' and 'date'; everything from column index 2 on is numeric.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project every row onto the fitted rotation and keep only the first 6 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of ALL
# numeric inputs, which here include 'Mortality Rate' and its lag features), not
# the original variables they are named after — the labels are misleading and the
# target leaks into X. Confirm whether this is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used downstream (X is
# built from principal_df); this step only removes 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target variable: the raw (unlagged) mortality rate.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Gradient-boosted tree regressor (XGBoost)
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the grid search
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelized over all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9981149623768651
# Refit the best estimator found by the grid search on the full training set
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) is the KL divergence between the two (internally
# normalized) vectors. It returns inf whenever y_pred has a zero (or negative)
# entry where y_test is positive, so clip both series to a tiny positive floor
# first to keep the metric finite on mortality series containing zeros.
entropy_val = entropy(np.clip(y_test, 1e-12, None), np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0022798086212839955 R2 Score: 0.9942594043941946 RMSE: 0.047747 Entropy Value: 0.0007392045794560473
# Gain-based importance scores from the fitted booster, sorted most -> least important.
feature_importances = best_model.feature_importances_
# NOTE(review): the "features" here are principal components that were merely
# labelled with the original column names; the importances describe the PCs,
# not the raw variables they are named after.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.846222 |
| 0 | cardiovasc_death_rate | 0.084867 |
| 5 | median_age | 0.044414 |
| 2 | male_smokers | 0.020585 |
| 3 | life_expectancy | 0.002959 |
| 4 | aged_65_older | 0.000952 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute Windows-specific path — consider a relative path or a
# configurable constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Latvia'
country2 = 'Luxembourg'
# Keep the country-health-index predictors (plus identifiers and the target),
# then restrict to the two countries under comparison, in a single .loc step.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
in_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_countries, keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631969 |
2079 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
Converting the time-series data into this supervised, tabular format is what allows the XGBoost model to be applied directly to assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Build supervised-learning lag features from the mortality time series:
# per-country 1-day, 7-day, and 30-day lags of 'Mortality Rate'.
# Rows with no earlier observation to lag from are filled with 0.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (
        df_updated.groupby(['location'])['Mortality Rate'].shift(lag).fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on unscaled features and on the FULL dataset (before the
# train/test split), so high-variance columns dominate the rotation and test rows
# leak into it. The iloc[:, 2:] slice also includes 'Mortality Rate' and its lag
# columns, i.e. the target itself enters the components — confirm this is intended.
pca = PCA()
# Columns 0-1 are 'location' and 'date'; everything from column index 2 on is numeric.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project every row onto the fitted rotation and keep only the first 6 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of ALL
# numeric inputs, which here include 'Mortality Rate' and its lag features), not
# the original variables they are named after — the labels are misleading and the
# target leaks into X. Confirm whether this is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used downstream (X is
# built from principal_df); this step only removes 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Target variable: the raw (unlagged) mortality rate.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Gradient-boosted tree regressor (XGBoost)
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the grid search
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelized over all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9980463202050925
# Refit the best estimator found by the grid search on the full training set
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) is the KL divergence between the two (internally
# normalized) vectors. It returns inf whenever y_pred has a zero (or negative)
# entry where y_test is positive — exactly the "Entropy Value: inf" seen in this
# run's output — so clip both series to a tiny positive floor to keep it finite.
entropy_val = entropy(np.clip(y_test, 1e-12, None), np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0019845762199695097 R2 Score: 0.995002804436573 RMSE: 0.044549 Entropy Value: inf
# Gain-based importance scores from the fitted booster, sorted most -> least important.
feature_importances = best_model.feature_importances_
# NOTE(review): the "features" here are principal components that were merely
# labelled with the original column names; the importances describe the PCs,
# not the raw variables they are named after.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.485630 |
| 5 | population | 0.260661 |
| 0 | hospital_beds_per_thousand | 0.204685 |
| 2 | extreme_poverty | 0.042192 |
| 3 | gdp_per_capita | 0.005108 |
| 4 | population_density | 0.001724 |
# Importing the dataframe of all 26 countries (local CSV export of the cleaned OWID data).
# NOTE(review): hard-coded absolute Windows path — not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the freshly loaded frame (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this section.
country1 = 'Netherlands'
country2 = 'Sweden'
# Keep only the population-health-index features (plus identifiers and the target),
# restricted to the two countries of interest — done in a single .loc selection.
_population_health_cols = [
    'location', 'date',
    'cardiovasc_death_rate', 'female_smokers', 'male_smokers',
    'life_expectancy', 'aged_65_older', 'median_age',
    'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), _population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 133.982 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 133.982 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 133.982 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 133.982 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 133.982 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.816005 |
2100 rows × 9 columns
'''
Create lagged variables for the previous day's, week's, and month's mortality rate using
pandas' shift() so the OWID time series becomes a tabular supervised-learning problem:
each row is one observation and each column one feature, which is the form the XGBoost
model needs to rank predictors of COVID-19 mortality per country.
'''
# Lagged mortality, computed per country so one country's series never bleeds into another's.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; treat it as zero mortality.
# NOTE(review): zero-filling distorts the earliest window — dropping those rows may be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the predictors.
# BUG FIX: the PCA input used to be df_updated.iloc[:, 2:], which contains the
# 'Mortality Rate' column itself — i.e. the prediction target leaked into the components
# that X is built from, inflating every downstream score.  Fit the PCA on the predictors
# (static country features + lagged mortality) only.
# NOTE(review): PCA is fitted on unscaled data, so large-scale columns dominate the
# components; fitting the StandardScaler before PCA would be the conventional order.
pca_feature_cols = [c for c in df_updated.columns if c not in ('location', 'date', 'Mortality Rate')]
pca = PCA()
pca.fit(df_updated[pca_feature_cols])
# Keep six components — the number of input variables for the XGBoost Model Analysis
# for the population health index.
n_components = 6
principal_components = pca.transform(df_updated[pca_feature_cols])[:, :n_components]
# DataFrame of the six leading components.  The column names reuse the original feature
# names purely as positional labels (column i is really principal component i+1).
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column (kept for completeness; X below comes from the PCs).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training portion only to avoid test-set leakage.
scaler = StandardScaler()
scaler.fit(X_train)
# Notebook output: StandardScaler()
# (In a Jupyter environment, rerun the cell to show the HTML representation or trust the notebook.)
# Scale both splits with the scaler fitted on the training data only.
X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train, X_test))
# Base XGBoost regressor whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Search space: tree depth, shrinkage, ensemble size, split penalty, and row/column sampling.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation, parallelized across all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9991742414438469
# Fit the model using the best hyperparameters found by the grid search
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE),
# R^2 Score, and the entropy (KL divergence) between observed and predicted rates.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# BUG FIX: scipy.stats.entropy(pk, qk) is the KL divergence and returns inf whenever a
# predicted value is 0 (or negative, which XGBRegressor can emit) while the observed one
# is positive — zero-mortality days early in the series trigger exactly that.  Clip both
# arrays to a tiny positive floor so the metric is always finite; entropy() renormalizes
# its inputs internally, so the clipping is numerically negligible.
_eps = 1e-12
entropy_val = entropy(np.clip(np.asarray(y_test, dtype=float), _eps, None),
                      np.clip(np.asarray(y_pred, dtype=float), _eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.011924917352362436 R2 Score: 0.9988595164779729 RMSE: 0.109201 Entropy Value: 0.0006866784019283522
# Per-feature importance scores from the fitted booster, as a sorted table.
# NOTE(review): X was built from principal components (principal_df), not the raw
# features, so these rows describe PC1..PC6 — the names from selected_cols are only
# positional labels and do not correspond to the original variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.737708 |
| 2 | male_smokers | 0.199388 |
| 0 | cardiovasc_death_rate | 0.035390 |
| 3 | life_expectancy | 0.017898 |
| 5 | median_age | 0.007344 |
| 4 | aged_65_older | 0.002271 |
# Importing the dataframe of all 26 countries (local CSV export of the cleaned OWID data).
# NOTE(review): hard-coded absolute Windows path — not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the freshly loaded frame (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this section.
country1 = 'Netherlands'
country2 = 'Sweden'
# Keep only the country-health-index features (plus identifiers and the target),
# restricted to the two countries of interest — done in a single .loc selection.
_country_health_cols = [
    'location', 'date',
    'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
    'gdp_per_capita', 'population_density', 'population',
    'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), _country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.816005 |
2100 rows × 9 columns
'''
Create lagged variables for the previous day's, week's, and month's mortality rate using
pandas' shift() so the OWID time series becomes a tabular supervised-learning problem:
each row is one observation and each column one feature, which is the form the XGBoost
model needs to rank predictors of COVID-19 mortality per country.
'''
# Lagged mortality, computed per country so one country's series never bleeds into another's.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; treat it as zero mortality.
# NOTE(review): zero-filling distorts the earliest window — dropping those rows may be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the predictors.
# BUG FIX: the PCA input used to be df_updated.iloc[:, 2:], which contains the
# 'Mortality Rate' column itself — i.e. the prediction target leaked into the components
# that X is built from, inflating every downstream score.  Fit the PCA on the predictors
# (static country features + lagged mortality) only.
# NOTE(review): PCA is fitted on unscaled data, so large-scale columns dominate the
# components; fitting the StandardScaler before PCA would be the conventional order.
pca_feature_cols = [c for c in df_updated.columns if c not in ('location', 'date', 'Mortality Rate')]
pca = PCA()
pca.fit(df_updated[pca_feature_cols])
# Keep six components — the number of input variables for the XGBoost Model Analysis
# for the country health index.
n_components = 6
principal_components = pca.transform(df_updated[pca_feature_cols])[:, :n_components]
# DataFrame of the six leading components.  The column names reuse the original feature
# names purely as positional labels (column i is really principal component i+1).
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column (kept for completeness; X below comes from the PCs).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training portion only to avoid test-set leakage.
scaler = StandardScaler()
scaler.fit(X_train)
# Notebook output: StandardScaler()
# (In a Jupyter environment, rerun the cell to show the HTML representation or trust the notebook.)
# Scale both splits with the scaler fitted on the training data only.
X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train, X_test))
# Base XGBoost regressor whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Search space: tree depth, shrinkage, ensemble size, split penalty, and row/column sampling.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation, parallelized across all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9992006206155605
# Fit the model using the best hyperparameters found by the grid search
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE),
# R^2 Score, and the entropy (KL divergence) between observed and predicted rates.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# BUG FIX: scipy.stats.entropy(pk, qk) is the KL divergence and returns inf whenever a
# predicted value is 0 (or negative, which XGBRegressor can emit) while the observed one
# is positive — zero-mortality days early in the series trigger exactly that.  Clip both
# arrays to a tiny positive floor so the metric is always finite; entropy() renormalizes
# its inputs internally, so the clipping is numerically negligible.
_eps = 1e-12
entropy_val = entropy(np.clip(np.asarray(y_test, dtype=float), _eps, None),
                      np.clip(np.asarray(y_pred, dtype=float), _eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.015468257535200518 R2 Score: 0.9985206360503729 RMSE: 0.124371 Entropy Value: 0.0009440623489042599
# Per-feature importance scores from the fitted booster, as a sorted table.
# NOTE(review): X was built from principal components (principal_df), not the raw
# features, so these rows describe PC1..PC6 — the names from selected_cols are only
# positional labels and do not correspond to the original variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.669177 |
| 2 | extreme_poverty | 0.171464 |
| 0 | hospital_beds_per_thousand | 0.112389 |
| 5 | population | 0.024007 |
| 3 | gdp_per_capita | 0.019415 |
| 4 | population_density | 0.003549 |
# Importing the dataframe of all 26 countries (local CSV export of the cleaned OWID data).
# NOTE(review): hard-coded absolute Windows path — not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the freshly loaded frame (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this section.
country1 = 'United Kingdom'
country2 = 'Austria'
# Keep only the population-health-index features (plus identifiers and the target),
# restricted to the two countries of interest — done in a single .loc selection.
_population_health_cols = [
    'location', 'date',
    'cardiovasc_death_rate', 'female_smokers', 'male_smokers',
    'life_expectancy', 'aged_65_older', 'median_age',
    'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), _population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13605 | United Kingdom | 12/25/2022 | 122.137 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13606 | United Kingdom | 12/26/2022 | 122.137 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13607 | United Kingdom | 12/27/2022 | 122.137 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13608 | United Kingdom | 12/28/2022 | 122.137 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13609 | United Kingdom | 12/29/2022 | 122.137 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
2102 rows × 9 columns
'''
Create lagged variables for the previous day's, week's, and month's mortality rate using
pandas' shift() so the OWID time series becomes a tabular supervised-learning problem:
each row is one observation and each column one feature, which is the form the XGBoost
model needs to rank predictors of COVID-19 mortality per country.
'''
# Lagged mortality, computed per country so one country's series never bleeds into another's.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; treat it as zero mortality.
# NOTE(review): zero-filling distorts the earliest window — dropping those rows may be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the predictors.
# BUG FIX: the PCA input used to be df_updated.iloc[:, 2:], which contains the
# 'Mortality Rate' column itself — i.e. the prediction target leaked into the components
# that X is built from, inflating every downstream score.  Fit the PCA on the predictors
# (static country features + lagged mortality) only.
# NOTE(review): PCA is fitted on unscaled data, so large-scale columns dominate the
# components; fitting the StandardScaler before PCA would be the conventional order.
pca_feature_cols = [c for c in df_updated.columns if c not in ('location', 'date', 'Mortality Rate')]
pca = PCA()
pca.fit(df_updated[pca_feature_cols])
# Keep six components — the number of input variables for the XGBoost Model Analysis
# for the population health index.
n_components = 6
principal_components = pca.transform(df_updated[pca_feature_cols])[:, :n_components]
# DataFrame of the six leading components.  The column names reuse the original feature
# names purely as positional labels (column i is really principal component i+1).
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column (kept for completeness; X below comes from the PCs).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training portion only to avoid test-set leakage.
scaler = StandardScaler()
scaler.fit(X_train)
# Notebook output: StandardScaler()
# (In a Jupyter environment, rerun the cell to show the HTML representation or trust the notebook.)
# Scale both splits with the scaler fitted on the training data only.
X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train, X_test))
# Base XGBoost regressor whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Search space: tree depth, shrinkage, ensemble size, split penalty, and row/column sampling.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation, parallelized across all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.982683505807579
# Fit the model using the best hyperparameters found by the grid search
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE),
# R^2 Score, and the entropy (KL divergence) between observed and predicted rates.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# BUG FIX: scipy.stats.entropy(pk, qk) is the KL divergence and returns inf whenever a
# predicted value is 0 (or negative, which XGBRegressor can emit) while the observed one
# is positive — zero-mortality days early in the series trigger exactly that.  Clip both
# arrays to a tiny positive floor so the metric is always finite; entropy() renormalizes
# its inputs internally, so the clipping is numerically negligible.
_eps = 1e-12
entropy_val = entropy(np.clip(np.asarray(y_test, dtype=float), _eps, None),
                      np.clip(np.asarray(y_pred, dtype=float), _eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 5.700044545743729 R2 Score: 0.8519894967019457 RMSE: 2.387477 Entropy Value: 0.01428821441905026
# Per-feature importance scores from the fitted booster, as a sorted table.
# NOTE(review): X was built from principal components (principal_df), not the raw
# features, so these rows describe PC1..PC6 — the names from selected_cols are only
# positional labels and do not correspond to the original variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.575495 |
| 0 | cardiovasc_death_rate | 0.331887 |
| 1 | female_smokers | 0.046679 |
| 2 | male_smokers | 0.020746 |
| 4 | aged_65_older | 0.017492 |
| 3 | life_expectancy | 0.007702 |
# Importing the dataframe of all 26 countries (local CSV export of the cleaned OWID data).
# NOTE(review): hard-coded absolute Windows path — not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the freshly loaded frame (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this section.
country1 = 'United Kingdom'
country2 = 'Austria'
# Keep only the country-health-index features (plus identifiers and the target),
# restricted to the two countries of interest — done in a single .loc selection.
_country_health_cols = [
    'location', 'date',
    'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
    'gdp_per_capita', 'population_density', 'population',
    'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), _country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13605 | United Kingdom | 12/25/2022 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 0.883564 |
| 13606 | United Kingdom | 12/26/2022 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 0.883564 |
| 13607 | United Kingdom | 12/27/2022 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 0.883564 |
| 13608 | United Kingdom | 12/28/2022 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 0.883564 |
| 13609 | United Kingdom | 12/29/2022 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 0.883564 |
2102 rows × 9 columns
# Convert the time series into a supervised-learning table: add per-country
# lagged copies of the target (previous day / week / month mortality) so each
# row becomes an independent observation usable by the XGBoost model.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Rows at the start of each country's series have no history; treat them as 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Performing Principal Component Analysis (PCA) in order to reduce multi-collinearity.
# FIX: the original fit PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate' itself — target leakage into the model
# inputs. Fit and transform only the predictor columns (the six
# country-health features plus the three lag features).
predictor_cols = df_updated.columns[2:].drop('Mortality Rate')
pca = PCA()
pca.fit(df_updated[predictor_cols])
# Keep the first 6 principal components (same count as the original country-health-index inputs).
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated[predictor_cols])[:, :n_components]
# NOTE(review): the component columns are re-labelled with raw feature names
# below; each PC is a mixture of all predictors, so downstream "feature
# importances" describe components rather than the original variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location dummy columns are never fed to the
# model (X is built from principal_df below), so this encoding is effectively
# unused — only the 'Mortality Rate' column is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: first six principal components (labelled with raw feature names);
# y: raw mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of time-series rows lets the model train on
# days that come after the test days; a date cutoff or TimeSeriesSplit would
# be safer — confirm the random split is intentional.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9858107613099282
# Fit the model using the best hyperparameters
# NOTE: GridSearchCV (refit=True by default) already refits best_estimator_
# on the full training data, so this extra fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence — not a standard
# regression metric, and any zero in y_pred yields inf; confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 5.255425341034955 R2 Score: 0.8635347244167126 RMSE: 2.292471 Entropy Value: 0.012939782263897097
# NOTE(review): because X consists of principal components, these
# "importances" rank PCs (mixtures of all predictors), not the raw features
# whose names appear in selected_cols.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.357251 |
| 1 | human_development_index | 0.314325 |
| 0 | hospital_beds_per_thousand | 0.158735 |
| 2 | extreme_poverty | 0.102932 |
| 4 | population_density | 0.044483 |
| 3 | gdp_per_capita | 0.022273 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — consider a relative path or
# a configurable constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Compare Bulgaria vs Czechia on the population-health-index features.
country1 = 'Bulgaria'
country2 = 'Czechia'
# Keep only the population-health-index predictors plus the target,
# restricted to the two countries under comparison.
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, ['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 227.485 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 227.485 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 227.485 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 227.485 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 227.485 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919575 |
2061 rows × 9 columns
# Convert the time series into a supervised-learning table: add per-country
# lagged copies of the target (previous day / week / month mortality) so each
# row becomes an independent observation usable by the XGBoost model.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Rows at the start of each country's series have no history; treat them as 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Performing Principal Component Analysis (PCA) in order to reduce multi-collinearity.
# FIX: the original fit PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate' itself — target leakage into the model
# inputs. Fit and transform only the predictor columns (the six
# population-health features plus the three lag features).
predictor_cols = df_updated.columns[2:].drop('Mortality Rate')
pca = PCA()
pca.fit(df_updated[predictor_cols])
# Keep the first 6 principal components (same count as the original population-health-index inputs).
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated[predictor_cols])[:, :n_components]
# NOTE(review): the component columns are re-labelled with raw feature names
# below; each PC is a mixture of all predictors, so downstream "feature
# importances" describe components rather than the original variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location dummy columns are never fed to the
# model (X is built from principal_df below), so this encoding is effectively
# unused — only the 'Mortality Rate' column is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: first six principal components (labelled with raw feature names);
# y: raw mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of time-series rows lets the model train on
# days that come after the test days; a date cutoff or TimeSeriesSplit would
# be safer — confirm the random split is intentional.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9590037445837588
# Fit the model using the best hyperparameters
# NOTE: GridSearchCV (refit=True by default) already refits best_estimator_
# on the full training data, so this extra fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence — not a standard
# regression metric, and any zero in y_pred yields inf; confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002678735178965485 R2 Score: 0.9983661769962296 RMSE: 0.051756 Entropy Value: 0.00028650102618085413
# NOTE(review): because X consists of principal components, these
# "importances" rank PCs (mixtures of all predictors), not the raw features
# whose names appear in selected_cols.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.480463 |
| 5 | median_age | 0.405327 |
| 2 | male_smokers | 0.053942 |
| 1 | female_smokers | 0.024489 |
| 4 | aged_65_older | 0.021659 |
| 3 | life_expectancy | 0.014120 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — consider a relative path or
# a configurable constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Compare Bulgaria vs Czechia on the country-health-index features.
country1 = 'Bulgaria'
country2 = 'Czechia'
# Keep only the country-health-index predictors plus the target, restricted
# to the two countries under comparison.
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919575 |
2061 rows × 9 columns
# Convert the time series into a supervised-learning table: add per-country
# lagged copies of the target (previous day / week / month mortality) so each
# row becomes an independent observation usable by the XGBoost model.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Rows at the start of each country's series have no history; treat them as 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Performing Principal Component Analysis (PCA) in order to reduce multi-collinearity.
# FIX: the original fit PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate' itself — target leakage into the model
# inputs. Fit and transform only the predictor columns (the six
# country-health features plus the three lag features).
predictor_cols = df_updated.columns[2:].drop('Mortality Rate')
pca = PCA()
pca.fit(df_updated[predictor_cols])
# Keep the first 6 principal components (same count as the original country-health-index inputs).
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated[predictor_cols])[:, :n_components]
# NOTE(review): the component columns are re-labelled with raw feature names
# below; each PC is a mixture of all predictors, so downstream "feature
# importances" describe components rather than the original variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location dummy columns are never fed to the
# model (X is built from principal_df below), so this encoding is effectively
# unused — only the 'Mortality Rate' column is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: first six principal components (labelled with raw feature names);
# y: raw mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of time-series rows lets the model train on
# days that come after the test days; a date cutoff or TimeSeriesSplit would
# be safer — confirm the random split is intentional.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9555145568773449
# Fit the model using the best hyperparameters
# NOTE: GridSearchCV (refit=True by default) already refits best_estimator_
# on the full training data, so this extra fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence — not a standard
# regression metric, and any zero in y_pred yields inf; confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002769256603078144 R2 Score: 0.9983109658703929 RMSE: 0.052624 Entropy Value: 0.00028051806238002606
# NOTE(review): because X consists of principal components, these
# "importances" rank PCs (mixtures of all predictors), not the raw features
# whose names appear in selected_cols.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | hospital_beds_per_thousand | 0.823913 |
| 5 | population | 0.128464 |
| 2 | extreme_poverty | 0.019683 |
| 1 | human_development_index | 0.016833 |
| 4 | population_density | 0.005945 |
| 3 | gdp_per_capita | 0.005162 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — consider a relative path or
# a configurable constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Compare Denmark vs Finland on the population-health-index features.
country1 = 'Denmark'
country2 = 'Finland'
# Keep only the population-health-index predictors plus the target,
# restricted to the two countries under comparison.
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, ['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 114.767 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5188 | Denmark | 2/3/2020 | 114.767 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5189 | Denmark | 2/4/2020 | 114.767 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5190 | Denmark | 2/5/2020 | 114.767 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5191 | Denmark | 2/6/2020 | 114.767 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 153.507 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 153.507 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 153.507 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 153.507 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 153.507 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
2128 rows × 9 columns
# Convert the time series into a supervised-learning table: add per-country
# lagged copies of the target (previous day / week / month mortality) so each
# row becomes an independent observation usable by the XGBoost model.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Rows at the start of each country's series have no history; treat them as 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Performing Principal Component Analysis (PCA) in order to reduce multi-collinearity.
# FIX: the original fit PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate' itself — target leakage into the model
# inputs. Fit and transform only the predictor columns (the six
# population-health features plus the three lag features).
predictor_cols = df_updated.columns[2:].drop('Mortality Rate')
pca = PCA()
pca.fit(df_updated[predictor_cols])
# Keep the first 6 principal components (same count as the original population-health-index inputs).
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated[predictor_cols])[:, :n_components]
# NOTE(review): the component columns are re-labelled with raw feature names
# below; each PC is a mixture of all predictors, so downstream "feature
# importances" describe components rather than the original variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location dummy columns are never fed to the
# model (X is built from principal_df below), so this encoding is effectively
# unused — only the 'Mortality Rate' column is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: first six principal components (labelled with raw feature names);
# y: raw mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of time-series rows lets the model train on
# days that come after the test days; a date cutoff or TimeSeriesSplit would
# be safer — confirm the random split is intentional.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987760862696191
# Fit the model using the best hyperparameters
# NOTE: GridSearchCV (refit=True by default) already refits best_estimator_
# on the full training data, so this extra fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence — not a standard
# regression metric, and any zero in y_pred yields inf; confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008754055895476695 R2 Score: 0.9950339966379534 RMSE: 0.093563 Entropy Value: 0.0017693996942911307
# NOTE(review): because X consists of principal components, these
# "importances" rank PCs (mixtures of all predictors), not the raw features
# whose names appear in selected_cols.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.821181 |
| 0 | cardiovasc_death_rate | 0.071899 |
| 2 | male_smokers | 0.051741 |
| 5 | median_age | 0.027919 |
| 3 | life_expectancy | 0.025571 |
| 4 | aged_65_older | 0.001689 |
# Import the combined dataframe of all 26 countries.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")

# Countries compared in this run of the pipeline.
country1 = 'Denmark'
country2 = 'Finland'

# Keep identifiers, the country-health-index features, and the target, then
# restrict the frame to the two countries above (2128 rows in the original run).
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand',
                         'human_development_index', 'extreme_poverty',
                         'gdp_per_capita', 'population_density', 'population',
                         'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]

# Lagged copies of the target (previous day / week / month, per country) turn
# the time series into a supervised-learning table usable by XGBoost. Rows with
# no history (the first 1/7/30 days of each country) are filled with 0.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag).fillna(0))

# Principal Component Analysis to address multi-collinearity among the inputs.
# BUG FIX: the original fitted PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate' itself, so the components encoded the target
# (target leakage) and inflated the downstream scores. PCA now sees predictor
# columns only (the six country features plus the three lag features).
# NOTE(review): PCA is still fitted on the full dataset before the train/test
# split (test-set statistics leak into the transform), and the inputs are
# unscaled, so large-magnitude columns such as population dominate the
# components — TODO: fit on the training split of standardized features.
feature_cols = [c for c in df_updated.columns
                if c not in ('location', 'date', 'Mortality Rate')]
pca = PCA()
pca.fit(df_updated[feature_cols])

# Keep six components to match the number of model inputs used in this analysis.
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]

# BUG FIX: the original labelled the components with the original feature names,
# which misattributes downstream feature importances; each PC is a linear mix of
# ALL inputs, so the columns are now named honestly as PC1..PC6.
selected_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values

# One-hot encode the country label. (The dummies are kept on df_updated but are
# not part of X below, which comes solely from the principal components.)
df_updated = pd.get_dummies(df_updated, columns=['location'])

# Model inputs and target for the XGBoost pipeline that follows.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training and testing sets (70/30) for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features: fit the scaler on the training split only, then apply
# the same transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBoost regressor and the hyperparameter grid to search over.
xgb_model = xgb.XGBRegressor()
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}

# Exhaustive grid search with 10-fold cross-validation (k = 10) on the training
# split. GridSearchCV's default scoring for a regressor is R^2, and with
# refit=True (the default) the best configuration is refit on the full training set.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Recorded output of the original run:
#   Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1,
#                          'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
#   Best CV score: 0.9984953654596765

# best_estimator_ is already refit on the whole training set by GridSearchCV, so
# the original's extra best_model.fit(X_train_scaled, y_train) call was redundant
# and has been removed (same data, same estimator — behavior unchanged).
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

# Evaluate on the held-out test set: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) computes the KL divergence after
# normalizing each vector to sum to 1 — it treats y_test/y_pred as probability
# distributions, which mortality rates are not, and is undefined for
# non-positive predictions. Kept for continuity with the original analysis, but
# this value should not be read as a regression-quality metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Recorded output of the original run:
#   MSE: 0.008136863131264145  R2 Score: 0.9953841179278682
#   RMSE: 0.090205  Entropy Value: 0.0015398143907589235

# Rank the model inputs by XGBoost's feature importances.
# NOTE(review): X here comes from the PCA step above, so each importance belongs
# to a column of X (a principal component — a linear mix of all inputs), not
# necessarily to a single original feature named in selected_cols.
feature_importances = pd.DataFrame({'feature': selected_cols,
                                    'importance': best_model.feature_importances_})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.753936 |
| 5 | population | 0.092143 |
| 0 | hospital_beds_per_thousand | 0.070220 |
| 2 | extreme_poverty | 0.061815 |
| 3 | gdp_per_capita | 0.020882 |
| 4 | population_density | 0.001005 |
# Import the combined dataframe of all 26 countries.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")

# Countries compared in this run of the pipeline.
country1 = 'Switzerland'
country2 = 'Canada'

# Keep identifiers, the population-health-index features, and the target, then
# restrict the frame to the two countries above (2111 rows in the original run).
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate',
                         'female_smokers', 'male_smokers', 'life_expectancy',
                         'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]

# Lagged copies of the target (previous day / week / month, per country) turn
# the time series into a supervised-learning table usable by XGBoost. Rows with
# no history (the first 1/7/30 days of each country) are filled with 0.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag).fillna(0))

# Principal Component Analysis to address multi-collinearity among the inputs.
# BUG FIX: the original fitted PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate' itself, so the components encoded the target
# (target leakage) and inflated the downstream scores. PCA now sees predictor
# columns only (the six population features plus the three lag features).
# NOTE(review): PCA is still fitted on the full dataset before the train/test
# split (test-set statistics leak into the transform), and the inputs are
# unscaled, so large-magnitude columns dominate the components — TODO: fit on
# the training split of standardized features.
feature_cols = [c for c in df_updated.columns
                if c not in ('location', 'date', 'Mortality Rate')]
pca = PCA()
pca.fit(df_updated[feature_cols])

# Keep six components to match the number of model inputs used in this analysis.
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]

# BUG FIX: the original labelled the components with the original feature names,
# which misattributes downstream feature importances; each PC is a linear mix of
# ALL inputs, so the columns are now named honestly as PC1..PC6.
selected_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values

# One-hot encode the country label. (The dummies are kept on df_updated but are
# not part of X below, which comes solely from the principal components.)
df_updated = pd.get_dummies(df_updated, columns=['location'])

# Model inputs and target for the XGBoost pipeline that follows.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training and testing sets (70/30) for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features: fit the scaler on the training split only, then apply
# the same transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBoost regressor and the hyperparameter grid to search over.
xgb_model = xgb.XGBRegressor()
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}

# Exhaustive grid search with 10-fold cross-validation (k = 10) on the training
# split. GridSearchCV's default scoring for a regressor is R^2, and with
# refit=True (the default) the best configuration is refit on the full training set.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Recorded output of the original run:
#   Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1,
#                          'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
#   Best CV score: 0.999066309456236

# best_estimator_ is already refit on the whole training set by GridSearchCV, so
# the original's extra best_model.fit(X_train_scaled, y_train) call was redundant
# and has been removed (same data, same estimator — behavior unchanged).
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

# Evaluate on the held-out test set: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) computes the KL divergence after
# normalizing each vector to sum to 1 — it treats y_test/y_pred as probability
# distributions, which mortality rates are not, and is undefined for
# non-positive predictions. Kept for continuity with the original analysis, but
# this value should not be read as a regression-quality metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Recorded output of the original run:
#   MSE: 0.010540142156221432  R2 Score: 0.9967890951053929
#   RMSE: 0.102665  Entropy Value: 0.0015411517323291324

# Rank the model inputs by XGBoost's feature importances.
# NOTE(review): X here comes from the PCA step above, so each importance belongs
# to a column of X (a principal component — a linear mix of all inputs), not
# necessarily to a single original feature named in selected_cols.
feature_importances = pd.DataFrame({'feature': selected_cols,
                                    'importance': best_model.feature_importances_})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.724506 |
| 0 | cardiovasc_death_rate | 0.158352 |
| 5 | median_age | 0.050841 |
| 3 | life_expectancy | 0.034126 |
| 2 | male_smokers | 0.029129 |
| 4 | aged_65_older | 0.003046 |
# Import the combined dataframe of all 26 countries.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")

# Countries compared in this run of the pipeline.
country1 = 'Switzerland'
country2 = 'Canada'

# Keep identifiers, the country-health-index features, and the target, then
# restrict the frame to the two countries above (2111 rows in the original run).
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand',
                         'human_development_index', 'extreme_poverty',
                         'gdp_per_capita', 'population_density', 'population',
                         'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]

# Lagged copies of the target (previous day / week / month, per country) turn
# the time series into a supervised-learning table usable by XGBoost. Rows with
# no history (the first 1/7/30 days of each country) are filled with 0.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag).fillna(0))

# Principal Component Analysis to address multi-collinearity among the inputs.
# BUG FIX: the original fitted PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate' itself, so the components encoded the target
# (target leakage) and inflated the downstream scores. PCA now sees predictor
# columns only (the six country features plus the three lag features).
# NOTE(review): PCA is still fitted on the full dataset before the train/test
# split (test-set statistics leak into the transform), and the inputs are
# unscaled, so large-magnitude columns such as population dominate the
# components — TODO: fit on the training split of standardized features.
feature_cols = [c for c in df_updated.columns
                if c not in ('location', 'date', 'Mortality Rate')]
pca = PCA()
pca.fit(df_updated[feature_cols])

# Keep six components to match the number of model inputs used in this analysis.
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]

# BUG FIX: the original labelled the components with the original feature names,
# which misattributes downstream feature importances; each PC is a linear mix of
# ALL inputs, so the columns are now named honestly as PC1..PC6.
selected_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values

# One-hot encode the country label. (The dummies are kept on df_updated but are
# not part of X below, which comes solely from the principal components.)
df_updated = pd.get_dummies(df_updated, columns=['location'])

# Model inputs and target for the XGBoost pipeline that follows.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training and testing sets (70/30) for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features: fit the scaler on the training split only, then apply
# the same transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBoost regressor and the hyperparameter grid to search over.
xgb_model = xgb.XGBRegressor()
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}

# Exhaustive grid search with 10-fold cross-validation (k = 10) on the training
# split. GridSearchCV's default scoring for a regressor is R^2, and with
# refit=True (the default) the best configuration is refit on the full training set.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Recorded output of the original run:
#   Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1,
#                          'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
#   Best CV score: 0.9989799879827812

# best_estimator_ is already refit on the whole training set by GridSearchCV, so
# the original's extra best_model.fit(X_train_scaled, y_train) call was redundant
# and has been removed (same data, same estimator — behavior unchanged).
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

# Evaluate on the held-out test set: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) computes the KL divergence after
# normalizing each vector to sum to 1 — it treats y_test/y_pred as probability
# distributions, which mortality rates are not, and is undefined for
# non-positive predictions. Kept for continuity with the original analysis, but
# this value should not be read as a regression-quality metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Recorded output of the original run:
#   MSE: 0.008411506866870231  R2 Score: 0.9974375536715211
#   RMSE: 0.091714  Entropy Value: 0.001265725840506202

# Rank the model inputs by XGBoost's feature importances.
# NOTE(review): X here comes from the PCA step above, so each importance belongs
# to a column of X (a principal component — a linear mix of all inputs), not
# necessarily to a single original feature named in selected_cols.
feature_importances = pd.DataFrame({'feature': selected_cols,
                                    'importance': best_model.feature_importances_})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.526054 |
| 5 | population | 0.306753 |
| 0 | hospital_beds_per_thousand | 0.071260 |
| 3 | gdp_per_capita | 0.048732 |
| 2 | extreme_poverty | 0.043372 |
| 4 | population_density | 0.003829 |
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Cyprus'
country2 = 'Portugal'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11513 | Portugal | 12/25/2022 | 127.842 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11514 | Portugal | 12/26/2022 | 127.842 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11515 | Portugal | 12/27/2022 | 127.842 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11516 | Portugal | 12/28/2022 | 127.842 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11517 | Portugal | 12/29/2022 | 127.842 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
2061 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# Shifting within each 'location' group keeps one country's history from
# bleeding into the other country's rows.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0.
# The first 1/7/30 rows per country have no history; 0 matches the observed
# early-pandemic mortality rate of 0 in this dataset.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): PCA is fit on the RAW (unscaled) numeric columns of the FULL
# dataset. Large-magnitude columns will dominate the components, and fitting
# before the train/test split leaks test information into the transform —
# confirm this is intended (standard practice is scale first, fit on train only).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Project onto the first six principal components (one per original input
# variable of the population-health model).
n_components = 6  # number of input variables for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# FIX: label the columns as principal components. Each PC is a linear
# combination of ALL numeric inputs, so reusing the original feature names
# (cardiovasc_death_rate, female_smokers, ...) here silently misattributed the
# downstream XGBoost feature importances to raw features.
pc_names = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_names)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding
# (kept for completeness; the model input X below is built from the PCs only).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_names
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so test statistics do not leak in.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Transform both splits with the scaler fitted on the training rows.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters are tuned by the grid below.
xgb_model = xgb.XGBRegressor()
# Search grid: tree depth, shrinkage, ensemble size, and regularisation knobs.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.997717565913917
# Refit the tuned model (best hyperparameters from the grid search) on the
# full training split.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict mortality rate on the held-out test split.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy.
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both vectors to sum to
# 1 and returns their KL divergence — it is not a standard regression error
# metric, and it returns inf whenever a y_pred entry is 0 while the matching
# y_test entry is positive. Confirm this metric is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0011053104561160806 R2 Score: 0.9988694272398144 RMSE: 0.033246 Entropy Value: 0.0004480598968395122
# Rank the model inputs by XGBoost's learned importance scores, highest first.
importance_scores = best_model.feature_importances_
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': importance_scores}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.437605 |
| 0 | cardiovasc_death_rate | 0.295545 |
| 5 | median_age | 0.225797 |
| 2 | male_smokers | 0.028535 |
| 3 | life_expectancy | 0.008015 |
| 4 | aged_65_older | 0.004504 |
# Importing the dataframe of all 26 countries.
# Re-read from disk because the preceding cells overwrote df_updated with a
# two-country subset.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Cyprus'
country2 = 'Portugal'
# Restrict the frame to the country-level features (plus identifiers and the
# target) used by the XGBoost analysis for the country health index.
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11513 | Portugal | 12/25/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 112.371 | 10270857 | 0.462977 |
| 11514 | Portugal | 12/26/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 112.371 | 10270857 | 0.462977 |
| 11515 | Portugal | 12/27/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 112.371 | 10270857 | 0.462977 |
| 11516 | Portugal | 12/28/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 112.371 | 10270857 | 0.462977 |
| 11517 | Portugal | 12/29/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 112.371 | 10270857 | 0.462977 |
2061 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# Shifting within each 'location' group keeps one country's history from
# bleeding into the other country's rows.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0.
# The first 1/7/30 rows per country have no history; 0 matches the observed
# early-pandemic mortality rate of 0 in this dataset.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): PCA is fit on the RAW (unscaled) numeric columns of the FULL
# dataset. The 'population' column (values in the millions) will dominate the
# components, and fitting before the train/test split leaks test information
# into the transform — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Project onto the first six principal components (one per original input
# variable of the country-health model).
n_components = 6  # number of input variables for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# FIX: label the columns as principal components. Each PC is a linear
# combination of ALL numeric inputs, so reusing the original feature names
# (hospital_beds_per_thousand, human_development_index, ...) here silently
# misattributed the downstream XGBoost feature importances to raw features.
pc_names = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_names)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding
# (kept for completeness; the model input X below is built from the PCs only).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_names
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so test statistics do not leak in.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Transform both splits with the scaler fitted on the training rows.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters are tuned by the grid below.
xgb_model = xgb.XGBRegressor()
# Search grid: tree depth, shrinkage, ensemble size, and regularisation knobs.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9972746167510301
# Refit the tuned model (best hyperparameters from the grid search) on the
# full training split.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict mortality rate on the held-out test split.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy.
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both vectors to sum to
# 1 and returns their KL divergence — it is not a standard regression error
# metric, and it returns inf whenever a y_pred entry is 0 while the matching
# y_test entry is positive. Confirm this metric is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0011288091118561118 R2 Score: 0.9988453915130793 RMSE: 0.033598 Entropy Value: 0.00046678790937134003
# Rank the model inputs by XGBoost's learned importance scores, highest first.
importance_scores = best_model.feature_importances_
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': importance_scores}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.410411 |
| 5 | population | 0.387691 |
| 0 | hospital_beds_per_thousand | 0.168443 |
| 2 | extreme_poverty | 0.027509 |
| 3 | gdp_per_capita | 0.003726 |
| 4 | population_density | 0.002220 |
# Importing the dataframe of all 26 countries.
# Re-read from disk because the preceding cells overwrote df_updated with a
# two-country subset.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Romania'
country2 = 'Serbia'
# Restrict the frame to the population-health features (plus identifiers and
# the target) used by the XGBoost analysis for the population health index.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 439.415 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 439.415 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 439.415 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 439.415 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 439.415 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 370.946 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 370.946 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 370.946 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 370.946 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 370.946 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
2076 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# Shifting within each 'location' group keeps one country's history from
# bleeding into the other country's rows.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0.
# The first 1/7/30 rows per country have no history; 0 matches the observed
# early-pandemic mortality rate of 0 in this dataset.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): PCA is fit on the RAW (unscaled) numeric columns of the FULL
# dataset. Large-magnitude columns will dominate the components, and fitting
# before the train/test split leaks test information into the transform —
# confirm this is intended (standard practice is scale first, fit on train only).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Project onto the first six principal components (one per original input
# variable of the population-health model).
n_components = 6  # number of input variables for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# FIX: label the columns as principal components. Each PC is a linear
# combination of ALL numeric inputs, so reusing the original feature names
# (cardiovasc_death_rate, female_smokers, ...) here silently misattributed the
# downstream XGBoost feature importances to raw features.
pc_names = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_names)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding
# (kept for completeness; the model input X below is built from the PCs only).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_names
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so test statistics do not leak in.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Transform both splits with the scaler fitted on the training rows.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters are tuned by the grid below.
xgb_model = xgb.XGBRegressor()
# Search grid: tree depth, shrinkage, ensemble size, and regularisation knobs.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.998541039563188
# Refit the tuned model (best hyperparameters from the grid search) on the
# full training split.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict mortality rate on the held-out test split.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy.
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both vectors to sum to
# 1 and returns their KL divergence — it is not a standard regression error
# metric, and it returns inf whenever a y_pred entry is 0 while the matching
# y_test entry is positive. Confirm this metric is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.001587330310482497 R2 Score: 0.9990758429374236 RMSE: 0.039841 Entropy Value: 0.0004022874798044647
# Rank the model inputs by XGBoost's learned importance scores, highest first.
importance_scores = best_model.feature_importances_
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': importance_scores}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.538776 |
| 5 | median_age | 0.265115 |
| 1 | female_smokers | 0.170741 |
| 2 | male_smokers | 0.017414 |
| 3 | life_expectancy | 0.007693 |
| 4 | aged_65_older | 0.000261 |
# Importing the dataframe of all 26 countries.
# Re-read from disk because the preceding cells overwrote df_updated with a
# two-country subset.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Romania'
country2 = 'Serbia'
# Restrict the frame to the country-level features (plus identifiers and the
# target) used by the XGBoost analysis for the country health index.
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
2076 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# Shifting within each 'location' group keeps one country's history from
# bleeding into the other country's rows.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0.
# The first 1/7/30 rows per country have no history; 0 matches the observed
# early-pandemic mortality rate of 0 in this dataset.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): PCA is fit on the RAW (unscaled) numeric columns of the FULL
# dataset. The 'population' column (values in the millions) will dominate the
# components, and fitting before the train/test split leaks test information
# into the transform — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Project onto the first six principal components (one per original input
# variable of the country-health model).
n_components = 6  # number of input variables for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# FIX: label the columns as principal components. Each PC is a linear
# combination of ALL numeric inputs, so reusing the original feature names
# (hospital_beds_per_thousand, human_development_index, ...) here silently
# misattributed the downstream XGBoost feature importances to raw features.
pc_names = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_names)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding
# (kept for completeness; the model input X below is built from the PCs only).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_names
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so test statistics do not leak in.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Transform both splits with the scaler fitted on the training rows.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters are tuned by the grid below.
xgb_model = xgb.XGBRegressor()
# Search grid: tree depth, shrinkage, ensemble size, and regularisation knobs.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979474526774726
# Refit the tuned model (best hyperparameters from the grid search) on the
# full training split.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict mortality rate on the held-out test split.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy.
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both vectors to sum to
# 1 and returns their KL divergence — it is not a standard regression error
# metric, and it returns inf whenever a y_pred entry is 0 while the matching
# y_test entry is positive. Confirm this metric is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002076185405706244 R2 Score: 0.9987912273877525 RMSE: 0.045565 Entropy Value: 0.00045853981129780964
# NOTE(review): these importances attach to principal components, not to the
# named original variables (the PC columns were only relabeled upstream) —
# interpret the ranking below with that caveat.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.663901 |
| 0 | hospital_beds_per_thousand | 0.186232 |
| 1 | human_development_index | 0.124847 |
| 2 | extreme_poverty | 0.013139 |
| 3 | gdp_per_capita | 0.011260 |
| 4 | population_density | 0.000621 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; prefer a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Slovakia'
country2 = 'Slovenia'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# NOTE(review): later cells assign new columns to this filtered frame; append
# .copy() here to avoid a pandas SettingWithCopyWarning on those assignments.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.536669 |
2091 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() relies on rows being chronologically ordered within each
# location ('date' is still a string and is never parsed) — verify the ordering.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows fabricates "0 mortality" history
# for each country's first days/weeks/month; dropping those rows would be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): two leakage issues — iloc[:,2:] includes the target ('Mortality
# Rate') and its lag columns among the PCA inputs, and PCA is fit on ALL rows
# before the train/test split. PCA should be fit on training features only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading labels — these columns are the first 6 principal
# components (linear mixes of ALL PCA inputs, target and lags included), not the
# original variables whose names they carry.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting 'location_*' dummies are never fed to the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of a daily time series mixes past and future rows;
# with lagged-target features upstream this inflates the scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 324 combinations x 10 folds = 3240 fits; shuffled KFold on
# time-series rows makes the CV score optimistic (TimeSeriesSplit is safer).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9982988581909075
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): redundant refit — GridSearchCV already refit best_estimator_
# on the full training set (refit=True by default).
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy returns the KL divergence of the two vectors
# normalized as probability distributions — not a regression metric; zeros in
# y_test make it ill-defined. Confirm this is intentional.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004058159026547636 R2 Score: 0.9980089836798032 RMSE: 0.063704 Entropy Value: 0.0005571201418958172
# NOTE(review): importances belong to principal components, not to the named
# original variables (the PC columns were only relabeled) — interpret with caution.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.740876 |
| 0 | cardiovasc_death_rate | 0.172804 |
| 5 | median_age | 0.065567 |
| 2 | male_smokers | 0.015433 |
| 4 | aged_65_older | 0.004372 |
| 3 | life_expectancy | 0.000948 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Slovakia'
country2 = 'Slovenia'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# NOTE(review): later cells assign new columns to this filtered frame; append
# .copy() here to avoid a pandas SettingWithCopyWarning on those assignments.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.536669 |
2091 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() relies on chronological row order within each location
# ('date' is never parsed from string) — verify the ordering.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling warm-up rows fabricates "0 mortality" history for
# each country's earliest dates; dropping those rows would be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): leakage — iloc[:,2:] includes 'Mortality Rate' and its lags in
# the PCA inputs, and PCA is fit on all rows before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading labels — these columns are the first 6 principal
# components (linear mixes of ALL PCA inputs, target and lags included), not
# the original variables whose names they carry.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting 'location_*' dummies are never fed to the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of a daily time series — a chronological split
# would avoid mixing future rows into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 324 combinations x 10 folds = 3240 fits; shuffled KFold on
# time-series rows makes the CV score optimistic (TimeSeriesSplit is safer).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979685572320796
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): redundant refit — GridSearchCV already refit best_estimator_
# on the full training set (refit=True by default).
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy returns the KL divergence of the two vectors
# normalized as probability distributions — not a regression metric; zeros in
# y_test make it ill-defined. Confirm this is intentional.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.010286891805743988 R2 Score: 0.994953039214247 RMSE: 0.101424 Entropy Value: 0.001005217028297138
# NOTE(review): importances belong to principal components, not to the named
# original variables (the PC columns were only relabeled) — interpret with caution.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.658730 |
| 5 | population | 0.288222 |
| 2 | extreme_poverty | 0.024648 |
| 0 | hospital_beds_per_thousand | 0.015281 |
| 3 | gdp_per_capita | 0.007035 |
| 4 | population_density | 0.006085 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Spain'
country2 = 'United States'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# NOTE(review): later cells assign new columns to this filtered frame; append
# .copy() here to avoid a pandas SettingWithCopyWarning on those assignments.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 24074 | Spain | 2/1/2020 | 99.403 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24075 | Spain | 2/2/2020 | 99.403 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24076 | Spain | 2/3/2020 | 99.403 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24077 | Spain | 2/4/2020 | 99.403 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24078 | Spain | 2/5/2020 | 99.403 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084791 |
2136 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() relies on chronological row order within each location
# ('date' is never parsed from string) — verify the ordering.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling warm-up rows fabricates "0 mortality" history for
# each country's earliest dates; dropping those rows would be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): leakage — iloc[:,2:] includes 'Mortality Rate' and its lags in
# the PCA inputs, and PCA is fit on all rows before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading labels — these columns are the first 6 principal
# components (linear mixes of ALL PCA inputs, target and lags included), not
# the original variables whose names they carry.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting 'location_*' dummies are never fed to the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of a daily time series — a chronological split
# would avoid mixing future rows into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 324 combinations x 10 folds = 3240 fits; shuffled KFold on
# time-series rows makes the CV score optimistic (TimeSeriesSplit is safer).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9980806947075394
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): redundant refit — GridSearchCV already refit best_estimator_
# on the full training set (refit=True by default).
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy returns the KL divergence of the two vectors
# normalized as probability distributions — not a regression metric; zeros in
# y_test make it ill-defined. Confirm this is intentional.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.04804061406111574 R2 Score: 0.9919137103360092 RMSE: 0.219182 Entropy Value: 0.0016530566772748438
# NOTE(review): importances belong to principal components, not to the named
# original variables (the PC columns were only relabeled) — interpret with caution.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.728742 |
| 0 | cardiovasc_death_rate | 0.180431 |
| 5 | median_age | 0.065855 |
| 2 | male_smokers | 0.014653 |
| 3 | life_expectancy | 0.010040 |
| 4 | aged_65_older | 0.000279 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Spain'
country2 = 'United States'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# NOTE(review): later cells assign new columns to this filtered frame; append
# .copy() here to avoid a pandas SettingWithCopyWarning on those assignments.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 24074 | Spain | 2/1/2020 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| 24075 | Spain | 2/2/2020 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| 24076 | Spain | 2/3/2020 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| 24077 | Spain | 2/4/2020 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| 24078 | Spain | 2/5/2020 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2136 rows × 9 columns
'''
Convert the COVID-19 time series into a supervised-learning table for XGBoost:
each row becomes one observation whose extra features are the mortality rate
observed 1 day, 7 days, and 30 days earlier (built with pandas shift()).
This tabular form lets the model rank predictors of COVID-19 mortality
per country directly.
'''
# Build each lagged mortality-rate column per country; the leading rows of a
# country have no history for that lag, so their NaNs are filled with 0.
for _lag, _col in [(1, 'prev_day_mortality'),
                   (7, 'prev_week_mortality'),
                   (30, 'prev_month_mortality')]:
    df_updated[_col] = df_updated.groupby('location')['Mortality Rate'].shift(_lag).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the matrix
# fed to PCA still contains 'Mortality Rate' plus its three lagged copies —
# the prediction target leaks into the components that are later used as
# model features. Consider fitting PCA on the predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project with the already-fitted PCA and keep only the first 6 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but the
# values are principal components (linear mixes of all PCA inputs), not the
# raw features — downstream feature-importance tables inherit this
# mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Carry the country label alongside the components (row order is preserved).
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here never enter X below, so this
# encoding changes df_updated's shape but not the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: the 6 principal components; y: the mortality-rate target. Rows align
# positionally because principal_df was built from the same df_updated rows.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training set only.
X_train_scaled, X_test_scaled = (
    scaler.transform(X_train),
    scaler.transform(X_test),
)
# XGBoost regressor tuned by exhaustive grid search with 10-fold
# cross-validation over the scaled training data.
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid: tree depth, shrinkage, ensemble size, split penalty,
# and the row/column subsampling fractions.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning combination and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9969133378057181
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set
# by default (refit=True), so this explicit fit repeats that work — harmless
# but redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy interprets its two arguments as
# (unnormalized) probability distributions and returns their KL divergence;
# applying it to raw regression targets/predictions is unusual — confirm this
# is the intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.06838014177054151 R2 Score: 0.9884901214435369 RMSE: 0.261496 Entropy Value: 0.002529999431244677
# Rank the model inputs by XGBoost importance, most important first.
# NOTE(review): the inputs are principal components that were merely labelled
# with the original feature names, so these importances describe PCs, not the
# raw features — verify before interpreting.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.552532 |
| 0 | hospital_beds_per_thousand | 0.179562 |
| 4 | population_density | 0.124521 |
| 5 | population | 0.053568 |
| 2 | extreme_poverty | 0.048355 |
| 3 | gdp_per_capita | 0.041462 |
# Country Pair by Pair Analysis relative to median age
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): absolute local path — rerunning requires the same machine and
# file location.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Showing the pairings of countries based on median age (13 pairs of countries)
# One sub-frame per country, filtered from the full dataset by 'location'.
df_Bulgaria = df[(df.location == "Bulgaria")]
df_Italy = df[(df.location == "Italy")]
df_Portugal = df[(df.location == "Portugal")]
df_Slovenia = df[(df.location == "Slovenia")]
df_Spain = df[(df.location == "Spain")]
df_Austria = df[(df.location == "Austria")]
df_Belgium = df[(df.location == "Belgium")]
df_Canada = df[(df.location == "Canada")]
df_Czechia = df[(df.location == "Czechia")]
df_Denmark = df[(df.location == "Denmark")]
df_Estonia = df[(df.location == "Estonia")]
df_Finland = df[(df.location == "Finland")]
df_France = df[(df.location == "France")]
df_Latvia = df[(df.location == "Latvia")]
df_Netherlands = df[(df.location == "Netherlands")]
df_Romania = df[(df.location == "Romania")]
df_Serbia = df[(df.location == "Serbia")]
df_Slovakia = df[(df.location == "Slovakia")]
df_Sweden = df[(df.location == "Sweden")]
df_Switzerland = df[(df.location == "Switzerland")]
df_Cyprus = df[(df.location == "Cyprus")]
df_Iceland = df[(df.location == "Iceland")]
df_Ireland = df[(df.location == "Ireland")]
df_Luxembourg = df[(df.location == "Luxembourg")]
df_UnitedKingdom = df[(df.location == "United Kingdom")]
df_UnitedStates = df[(df.location == "United States")]
# tail(-2) drops the first two UK rows — presumably to align its date range
# with the other countries; TODO confirm the reason.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)
# Exporting final_dataframe to CSV file
# NOTE(review): this writes to the current working directory, while later
# reads use an absolute path under Downloads — the round trip only works when
# the working directory is that same folder; verify. Also, the default
# index=True adds an extra unnamed index column to the CSV.
dataframe_one.to_csv("dataframe-one.csv")
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute Downloads path; the export above wrote to the working
# directory — these only match when the notebook runs from Downloads.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for the population-health-index XGBoost analysis.
country1 = 'Bulgaria'
country2 = 'Italy'
# Keep the identifier columns, the population-level health features, and the
# target ('Mortality Rate'); restrict rows to the two chosen countries.
_population_health_cols = [
    'location', 'date',
    'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
    'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), _population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 0.735109 |
2091 rows × 9 columns
'''
Convert the COVID-19 time series into a supervised-learning table for XGBoost:
each row becomes one observation whose extra features are the mortality rate
observed 1 day, 7 days, and 30 days earlier (built with pandas shift()).
This tabular form lets the model rank predictors of COVID-19 mortality
per country directly.
'''
# Build each lagged mortality-rate column per country; the leading rows of a
# country have no history for that lag, so their NaNs are filled with 0.
for _lag, _col in [(1, 'prev_day_mortality'),
                   (7, 'prev_week_mortality'),
                   (30, 'prev_month_mortality')]:
    df_updated[_col] = df_updated.groupby('location')['Mortality Rate'].shift(_lag).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the matrix
# fed to PCA still contains 'Mortality Rate' plus its three lagged copies —
# the prediction target leaks into the components that are later used as
# model features. Consider fitting PCA on the predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project with the already-fitted PCA and keep only the first 6 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but the
# values are principal components (linear mixes of all PCA inputs), not the
# raw features — downstream feature-importance tables inherit this
# mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
# Carry the country label alongside the components (row order is preserved).
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here never enter X below, so this
# encoding changes df_updated's shape but not the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
# X: the 6 principal components; y: the mortality-rate target. Rows align
# positionally because principal_df was built from the same df_updated rows.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training set only.
X_train_scaled, X_test_scaled = (
    scaler.transform(X_train),
    scaler.transform(X_test),
)
# XGBoost regressor tuned by exhaustive grid search with 10-fold
# cross-validation over the scaled training data.
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid: tree depth, shrinkage, ensemble size, split penalty,
# and the row/column subsampling fractions.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning combination and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9904546220941783
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set
# by default (refit=True), so this explicit fit repeats that work — harmless
# but redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy interprets its two arguments as
# (unnormalized) probability distributions and returns their KL divergence;
# applying it to raw regression targets/predictions is unusual — confirm this
# is the intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.01619619055558157 R2 Score: 0.9984776146320379 RMSE: 0.127264 Entropy Value: 0.0003811524578491067
# Rank the model inputs by XGBoost importance, most important first.
# NOTE(review): the inputs are principal components that were merely labelled
# with the original feature names, so these importances describe PCs, not the
# raw features — verify before interpreting.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | aged_65_older | 0.511189 |
| 1 | diabetes_prevalence | 0.460899 |
| 2 | female_smokers | 0.013441 |
| 0 | cardiovasc_death_rate | 0.008192 |
| 4 | life_expectancy | 0.004344 |
| 3 | male_smokers | 0.001936 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute Downloads path; see the export step above, which
# wrote to the working directory — these only match when the notebook runs
# from Downloads.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for the country-health-index XGBoost analysis.
country1 = 'Bulgaria'
country2 = 'Italy'
# Keep the identifier columns, the country-level health-index features, and
# the target ('Mortality Rate'); restrict rows to the two chosen countries.
_country_health_cols = [
    'location', 'date',
    'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
    'gdp_per_capita', 'population_density', 'population', 'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), _country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.180 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.180 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.180 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.180 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.180 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
2091 rows × 9 columns
'''
Convert the COVID-19 time series into a supervised-learning table for XGBoost:
each row becomes one observation whose extra features are the mortality rate
observed 1 day, 7 days, and 30 days earlier (built with pandas shift()).
This tabular form lets the model rank predictors of COVID-19 mortality
per country directly.
'''
# Build each lagged mortality-rate column per country; the leading rows of a
# country have no history for that lag, so their NaNs are filled with 0.
for _lag, _col in [(1, 'prev_day_mortality'),
                   (7, 'prev_week_mortality'),
                   (30, 'prev_month_mortality')]:
    df_updated[_col] = df_updated.groupby('location')['Mortality Rate'].shift(_lag).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the matrix
# fed to PCA still contains 'Mortality Rate' plus its three lagged copies —
# the prediction target leaks into the components that are later used as
# model features. Consider fitting PCA on the predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project with the already-fitted PCA and keep only the first 6 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but the
# values are principal components (linear mixes of all PCA inputs), not the
# raw features — downstream feature-importance tables inherit this
# mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Carry the country label alongside the components (row order is preserved).
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here never enter X below, so this
# encoding changes df_updated's shape but not the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: the 6 principal components; y: the mortality-rate target. Rows align
# positionally because principal_df was built from the same df_updated rows.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training set only.
X_train_scaled, X_test_scaled = (
    scaler.transform(X_train),
    scaler.transform(X_test),
)
# XGBoost regressor tuned by exhaustive grid search with 10-fold
# cross-validation over the scaled training data.
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid: tree depth, shrinkage, ensemble size, split penalty,
# and the row/column subsampling fractions.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning combination and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9922672063895973
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set
# by default (refit=True), so this explicit fit repeats that work — harmless
# but redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy interprets its two arguments as
# (unnormalized) probability distributions and returns their KL divergence;
# applying it to raw regression targets/predictions is unusual — confirm this
# is the intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.015206168226033538 R2 Score: 0.998570673274642 RMSE: 0.123313 Entropy Value: 0.0005836511271442241
# Rank the model inputs by XGBoost importance, most important first.
# NOTE(review): the inputs are principal components that were merely labelled
# with the original feature names, so these importances describe PCs, not the
# raw features — verify before interpreting.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.643511 |
| 5 | population | 0.194207 |
| 0 | hospital_beds_per_thousand | 0.129965 |
| 2 | extreme_poverty | 0.022751 |
| 4 | population_density | 0.006924 |
| 3 | gdp_per_capita | 0.002641 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute Downloads path; see the export step above, which
# wrote to the working directory — these only match when the notebook runs
# from Downloads.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for the population-health-index XGBoost analysis.
country1 = 'Portugal'
country2 = 'Slovenia'
# Keep the identifier columns, the population-level health features, and the
# target ('Mortality Rate'); restrict rows to the two chosen countries.
_population_health_cols = [
    'location', 'date',
    'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
    'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), _population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 0.536669 |
2096 rows × 9 columns
# Convert the time series into a supervised-learning table: create lagged
# copies of the target ('Mortality Rate' shifted by 1 day, 7 days, 30 days,
# per country) so each row carries its own recent history as features, which
# lets a tabular model such as XGBoost be applied directly.
for lag, col in [(1, 'prev_day_mortality'),
                 (7, 'prev_week_mortality'),
                 (30, 'prev_month_mortality')]:
    # shift() within each country so lags never cross country boundaries;
    # the first `lag` rows of each country have no history and are set to 0.
    df_updated[col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)

# PCA to decorrelate the multi-collinear predictors.
# FIX(review): the original fit PCA on df_updated.iloc[:, 2:], a slice that
# still contained the prediction target 'Mortality Rate'. Projecting the
# target into the model inputs is direct leakage and largely explains the
# near-perfect R^2 reported downstream. PCA is now fit on predictors only.
feature_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                'male_smokers', 'life_expectancy', 'aged_65_older',
                'prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep 6 components, matching the number of inputs used by the XGBoost model.
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# FIX(review): the original relabelled the principal components with the raw
# feature names, which misrepresents what the downstream "feature importance"
# table measures. Components are now named honestly as PC1..PC6.
selected_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (the dummies stay on df_updated; they are
# not part of the model input X below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Train/test split for the XGBoost model.
# NOTE(review): a shuffled random split on daily time-series rows mixes past
# and future observations; together with the lagged-mortality features this
# still inflates test scores. A chronological split would be safer for any
# forecasting claim — TODO confirm intended use.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Scale using statistics learned from the training set only, then apply the
# same transform to the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter search: 10-fold cross-validation over a small XGBoost grid.
xgb_model = xgb.XGBRegressor()
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)

# best_estimator_ is already refit on the full training set (GridSearchCV
# refit=True by default), so the original's extra .fit() call was redundant.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

# Test-set evaluation: MSE, RMSE, R^2, plus the original's "entropy" figure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence between the
# two sequences after normalising each to sum to 1 — it is not a standard
# regression metric, and zeros in y_test make it ill-defined. Kept only for
# comparability with earlier runs; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)

# Importance of each principal component in the fitted model (not of the raw
# features — map back through pca.components_ to attribute to raw features).
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.861019 |
| 0 | cardiovasc_death_rate | 0.088271 |
| 3 | male_smokers | 0.020801 |
| 2 | female_smokers | 0.020747 |
| 5 | aged_65_older | 0.008471 |
| 4 | life_expectancy | 0.000691 |
# Reload the full cleaned dataset (all 26 countries) so the next country-pair
# analysis starts from the same unfiltered frame.
# NOTE(review): absolute local Windows path — breaks on any other machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the country-health-index analysis.
country1 = 'Portugal'
country2 = 'Slovenia'
# Restrict the frame to the two selected countries and keep only the
# country-health predictors plus the target column.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'extreme_poverty',
                       'gdp_per_capita', 'population_density', 'population',
                       'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.536669 |
2096 rows × 9 columns
# Convert the time series into a supervised-learning table: create lagged
# copies of the target ('Mortality Rate' shifted by 1 day, 7 days, 30 days,
# per country) so each row carries its own recent history as features, which
# lets a tabular model such as XGBoost be applied directly.
for lag, col in [(1, 'prev_day_mortality'),
                 (7, 'prev_week_mortality'),
                 (30, 'prev_month_mortality')]:
    # shift() within each country so lags never cross country boundaries;
    # the first `lag` rows of each country have no history and are set to 0.
    df_updated[col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)

# PCA to decorrelate the multi-collinear predictors.
# FIX(review): the original fit PCA on df_updated.iloc[:, 2:], a slice that
# still contained the prediction target 'Mortality Rate'. Projecting the
# target into the model inputs is direct leakage and largely explains the
# near-perfect R^2 reported downstream. PCA is now fit on predictors only.
feature_cols = ['hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density',
                'population',
                'prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep 6 components, matching the number of inputs used by the XGBoost model.
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# FIX(review): the original relabelled the principal components with the raw
# feature names, which misrepresents what the downstream "feature importance"
# table measures. Components are now named honestly as PC1..PC6.
selected_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (the dummies stay on df_updated; they are
# not part of the model input X below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Train/test split for the XGBoost model.
# NOTE(review): a shuffled random split on daily time-series rows mixes past
# and future observations; together with the lagged-mortality features this
# still inflates test scores. A chronological split would be safer for any
# forecasting claim — TODO confirm intended use.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Scale using statistics learned from the training set only, then apply the
# same transform to the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter search: 10-fold cross-validation over a small XGBoost grid.
xgb_model = xgb.XGBRegressor()
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)

# best_estimator_ is already refit on the full training set (GridSearchCV
# refit=True by default), so the original's extra .fit() call was redundant.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

# Test-set evaluation: MSE, RMSE, R^2, plus the original's "entropy" figure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence between the
# two sequences after normalising each to sum to 1 — it is not a standard
# regression metric, and zeros in y_test make it ill-defined. Kept only for
# comparability with earlier runs; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)

# Importance of each principal component in the fitted model (not of the raw
# features — map back through pca.components_ to attribute to raw features).
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.727041 |
| 5 | population | 0.146595 |
| 0 | hospital_beds_per_thousand | 0.068287 |
| 2 | extreme_poverty | 0.033873 |
| 3 | gdp_per_capita | 0.023341 |
| 4 | population_density | 0.000863 |
# Reload the full cleaned dataset (all 26 countries) so the next country-pair
# analysis starts from the same unfiltered frame.
# NOTE(review): absolute local Windows path — breaks on any other machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the population-health-index analysis.
country1 = 'Spain'
country2 = 'Austria'
# Restrict the frame to the two selected countries and keep only the
# population-health predictors plus the target column.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate',
                          'diabetes_prevalence', 'female_smokers', 'male_smokers',
                          'life_expectancy', 'aged_65_older', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 0.855148 |
2102 rows × 9 columns
# Convert the time series into a supervised-learning table: create lagged
# copies of the target ('Mortality Rate' shifted by 1 day, 7 days, 30 days,
# per country) so each row carries its own recent history as features, which
# lets a tabular model such as XGBoost be applied directly.
for lag, col in [(1, 'prev_day_mortality'),
                 (7, 'prev_week_mortality'),
                 (30, 'prev_month_mortality')]:
    # shift() within each country so lags never cross country boundaries;
    # the first `lag` rows of each country have no history and are set to 0.
    df_updated[col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)

# PCA to decorrelate the multi-collinear predictors.
# FIX(review): the original fit PCA on df_updated.iloc[:, 2:], a slice that
# still contained the prediction target 'Mortality Rate'. Projecting the
# target into the model inputs is direct leakage and largely explains the
# near-perfect R^2 reported downstream. PCA is now fit on predictors only.
feature_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                'male_smokers', 'life_expectancy', 'aged_65_older',
                'prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep 6 components, matching the number of inputs used by the XGBoost model.
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# FIX(review): the original relabelled the principal components with the raw
# feature names, which misrepresents what the downstream "feature importance"
# table measures. Components are now named honestly as PC1..PC6.
selected_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (the dummies stay on df_updated; they are
# not part of the model input X below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Train/test split for the XGBoost model.
# NOTE(review): a shuffled random split on daily time-series rows mixes past
# and future observations; together with the lagged-mortality features this
# still inflates test scores. A chronological split would be safer for any
# forecasting claim — TODO confirm intended use.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Scale using statistics learned from the training set only, then apply the
# same transform to the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter search: 10-fold cross-validation over a small XGBoost grid.
xgb_model = xgb.XGBRegressor()
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)

# best_estimator_ is already refit on the full training set (GridSearchCV
# refit=True by default), so the original's extra .fit() call was redundant.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

# Test-set evaluation: MSE, RMSE, R^2, plus the original's "entropy" figure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence between the
# two sequences after normalising each to sum to 1 — it is not a standard
# regression metric, and zeros in y_test make it ill-defined. Kept only for
# comparability with earlier runs; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)

# Importance of each principal component in the fitted model (not of the raw
# features — map back through pca.components_ to attribute to raw features).
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.471275 |
| 0 | cardiovasc_death_rate | 0.425063 |
| 5 | aged_65_older | 0.059626 |
| 2 | female_smokers | 0.033989 |
| 3 | male_smokers | 0.009843 |
| 4 | life_expectancy | 0.000204 |
# Reload the full cleaned dataset (all 26 countries) so the next country-pair
# analysis starts from the same unfiltered frame.
# NOTE(review): absolute local Windows path — breaks on any other machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the country-health-index analysis.
country1 = 'Spain'
country2 = 'Austria'
# Restrict the frame to the two selected countries and keep only the
# country-health predictors plus the target column.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'extreme_poverty',
                       'gdp_per_capita', 'population_density', 'population',
                       'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.998690899271246
# GridSearchCV uses refit=True by default, so best_estimator_ has already
# been refit on the full training set — an extra fit() call here would be
# redundant work producing the same model.
best_model = grid_search.best_estimator_
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: MSE, RMSE, R^2 score, and entropy.
# NOTE(review): scipy.stats.entropy normalises its inputs and computes a
# KL divergence between distributions — applying it to raw regression
# targets/predictions is unusual; confirm this is the intended metric.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.006240496359956682 R2 Score: 0.9988904964039489 RMSE: 0.078997 Entropy Value: 0.0005445524704743854
# Rank the model inputs by the booster's importance scores (note: the
# "features" are the relabelled principal components from the PCA step).
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.524350 |
| 5 | population | 0.363217 |
| 0 | hospital_beds_per_thousand | 0.075626 |
| 2 | extreme_poverty | 0.027055 |
| 3 | gdp_per_capita | 0.009581 |
| 4 | population_density | 0.000171 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — runs only on the
# author's machine; consider a relative path or configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression so the notebook renders the frame.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the analysis.
country1 = 'Belgium'
country2 = 'Canada'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the
# lagged columns assigned later do not trigger SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 1.093162 |
2132 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality: one day, one week, and one month back, computed within
# each country so history never leaks across locations.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
# The earliest rows of each country have no history; treat them as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Fit PCA on every column after location/date to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and its lags, so
# the target feeds the components — likely unintended leakage; confirm
# before trusting the downstream model scores.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per input variable of the
# population-health-index model.
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): each column below is a PCA component, not the raw feature
# it is named after — the labels are misleading for importance readouts.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
principal_df = pd.DataFrame(principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column (only 'Mortality Rate' is read below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn scaling statistics on the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits using the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base regressor; its hyperparameters are chosen by the grid search below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation on all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990745539142948
# GridSearchCV uses refit=True by default, so best_estimator_ has already
# been refit on the full training set — an extra fit() call here would be
# redundant work producing the same model.
best_model = grid_search.best_estimator_
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: MSE, RMSE, R^2 score, and entropy.
# NOTE(review): scipy.stats.entropy normalises its inputs and computes a
# KL divergence between distributions — applying it to raw regression
# targets/predictions is unusual; confirm this is the intended metric.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.014310712890414518 R2 Score: 0.9989178637912717 RMSE: 0.119627 Entropy Value: 0.00033461301188507007
# Rank the model inputs by the booster's importance scores (note: the
# "features" are the relabelled principal components from the PCA step).
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.906030 |
| 0 | cardiovasc_death_rate | 0.054128 |
| 5 | aged_65_older | 0.022543 |
| 2 | female_smokers | 0.014168 |
| 3 | male_smokers | 0.003060 |
| 4 | life_expectancy | 0.000070 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — runs only on the
# author's machine; consider a relative path or configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression so the notebook renders the frame.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the analysis.
country1 = 'Belgium'
country2 = 'Canada'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the
# lagged columns assigned later do not trigger SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.093162 |
2132 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality: one day, one week, and one month back, computed within
# each country so history never leaks across locations.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
# The earliest rows of each country have no history; treat them as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Fit PCA on every column after location/date to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and its lags, so
# the target feeds the components — likely unintended leakage; confirm
# before trusting the downstream model scores.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per input variable of the
# country-health-index model.
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): each column below is a PCA component, not the raw feature
# it is named after — the labels are misleading for importance readouts.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
principal_df = pd.DataFrame(principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column (only 'Mortality Rate' is read below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn scaling statistics on the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits using the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base regressor; its hyperparameters are chosen by the grid search below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation on all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9982635754420282
# GridSearchCV uses refit=True by default, so best_estimator_ has already
# been refit on the full training set — an extra fit() call here would be
# redundant work producing the same model.
best_model = grid_search.best_estimator_
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: MSE, RMSE, R^2 score, and entropy.
# NOTE(review): scipy.stats.entropy normalises its inputs and computes a
# KL divergence between distributions — applying it to raw regression
# targets/predictions is unusual; confirm this is the intended metric.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.021593332086563184 R2 Score: 0.9983671724325057 RMSE: 0.146947 Entropy Value: 0.0011894401573703628
# Rank the model inputs by the booster's importance scores (note: the
# "features" are the relabelled principal components from the PCA step).
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.854404 |
| 0 | hospital_beds_per_thousand | 0.055223 |
| 2 | extreme_poverty | 0.054972 |
| 5 | population | 0.017190 |
| 3 | gdp_per_capita | 0.014188 |
| 4 | population_density | 0.004023 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — runs only on the
# author's machine; consider a relative path or configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression so the notebook renders the frame.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the analysis.
country1 = 'Czechia'
country2 = 'Denmark'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the
# lagged columns assigned later do not trigger SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6244 | Denmark | 12/25/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 0.227772 |
| 6245 | Denmark | 12/26/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 0.227772 |
| 6246 | Denmark | 12/27/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 0.228905 |
| 6247 | Denmark | 12/28/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 0.229131 |
| 6248 | Denmark | 12/29/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 0.229131 |
2096 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality: one day, one week, and one month back, computed within
# each country so history never leaks across locations.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
# The earliest rows of each country have no history; treat them as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Fit PCA on every column after location/date to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and its lags, so
# the target feeds the components — likely unintended leakage; confirm
# before trusting the downstream model scores.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per input variable of the
# population-health-index model.
n_components = 6 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): each column below is a PCA component, not the raw feature
# it is named after — the labels are misleading for importance readouts.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
principal_df = pd.DataFrame(principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column (only 'Mortality Rate' is read below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn scaling statistics on the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits using the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base regressor; its hyperparameters are chosen by the grid search below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation on all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9991757801493086
# GridSearchCV uses refit=True by default, so best_estimator_ has already
# been refit on the full training set — an extra fit() call here would be
# redundant work producing the same model.
best_model = grid_search.best_estimator_
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: MSE, RMSE, R^2 score, and entropy.
# NOTE(review): scipy.stats.entropy normalises its inputs and computes a
# KL divergence between distributions — applying it to raw regression
# targets/predictions is unusual; confirm this is the intended metric.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0026568878338268037 R2 Score: 0.9977941834696354 RMSE: 0.051545 Entropy Value: 0.000440953658890907
# Rank the model inputs by the booster's importance scores (note: the
# "features" are the relabelled principal components from the PCA step).
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.861674 |
| 5 | aged_65_older | 0.091122 |
| 0 | cardiovasc_death_rate | 0.032199 |
| 2 | female_smokers | 0.014190 |
| 3 | male_smokers | 0.000679 |
| 4 | life_expectancy | 0.000135 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — runs only on the
# author's machine; consider a relative path or configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression so the notebook renders the frame.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the analysis.
country1 = 'Czechia'
country2 = 'Denmark'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the
# lagged columns assigned later do not trigger SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6244 | Denmark | 12/25/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.227772 |
| 6245 | Denmark | 12/26/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.227772 |
| 6246 | Denmark | 12/27/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.228905 |
| 6247 | Denmark | 12/28/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.229131 |
| 6248 | Denmark | 12/29/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.229131 |
2096 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (1-day, 7-day, 30-day) per country;
# rows without a prior observation get 0 instead of NaN.
_lag_spec = {'prev_day_mortality': 1,
             'prev_week_mortality': 7,
             'prev_month_mortality': 30}
_mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for _col, _lag in _lag_spec.items():
    df_updated[_col] = _mortality_by_country.shift(_lag).fillna(0)
# Performing Principal Component Analysis (PCA) on every column after
# 'location'/'date' in order to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also covers 'Mortality Rate' and the lag columns,
# so the fitted components are partly derived from the target — confirm intent.
pca = PCA().fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project the same columns the PCA was fit on and keep only the first 6 components.
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' and its three lag columns, so
# the components carry target information — a likely cause of the near-perfect
# scores printed below; confirm this is intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these names relabel PC1..PC6 with the raw feature names; each
# component is a linear mix of ALL inputs, not the single named column.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# (the dummy columns are not used in X below; df_updated is only read for y)
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X = first six principal components; y = raw mortality rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# (70/30 split, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling is applied to PCA outputs here; standardizing BEFORE
# PCA is the conventional order — confirm this ordering is deliberate.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using statistics learned from the training set only)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 runs the 324 x 10 = 3240 fits on all available CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990802017690633
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits best_estimator_ on the full training
# set (refit=True by default), so this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns their KL divergence — it is not a
# standard regression error metric; confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0018119703339651765 R2 Score: 0.9984956556824479 RMSE: 0.042567 Entropy Value: 0.0003609256883659556
# Rank the model inputs by XGBoost feature importance, largest first.
# NOTE(review): the names in selected_cols label principal components, not the
# raw columns — interpret these importances accordingly.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.672643 |
| 0 | hospital_beds_per_thousand | 0.166905 |
| 5 | population | 0.134752 |
| 2 | extreme_poverty | 0.022526 |
| 3 | gdp_per_capita | 0.002956 |
| 4 | population_density | 0.000219 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute, user-specific Windows path — this only runs on the
# author's machine; consider a relative path or configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the population-health-index model.
country1 = 'Estonia'
country2 = 'Finland'
# Extracting important features for XGBoost Model Analysis for the population health index:
# identifier columns, the six population-health predictors, and the target.
_index_cols = ['location', 'date',
               'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers',
               'life_expectancy', 'aged_65_older', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), _index_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 0.00000 |
| 6250 | Estonia | 1/18/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 0.00000 |
| 6251 | Estonia | 2/5/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 0.00000 |
| 6252 | Estonia | 2/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 0.00000 |
| 6253 | Estonia | 2/7/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 0.55159 |
2127 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (1-day, 7-day, 30-day) per country;
# rows without a prior observation get 0 instead of NaN.
_lag_spec = {'prev_day_mortality': 1,
             'prev_week_mortality': 7,
             'prev_month_mortality': 30}
_mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for _col, _lag in _lag_spec.items():
    df_updated[_col] = _mortality_by_country.shift(_lag).fillna(0)
# Performing Principal Component Analysis (PCA) on every column after
# 'location'/'date' in order to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also covers 'Mortality Rate' and the lag columns,
# so the fitted components are partly derived from the target — confirm intent.
pca = PCA().fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project the same columns the PCA was fit on and keep only the first 6 components.
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' and its three lag columns, so
# the components carry target information — a likely cause of the near-perfect
# scores printed below; confirm this is intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these names relabel PC1..PC6 with the raw feature names; each
# component is a linear mix of ALL inputs, not the single named column.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# (the dummy columns are not used in X below; df_updated is only read for y)
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
# X = first six principal components; y = raw mortality rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# (70/30 split, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling is applied to PCA outputs here; standardizing BEFORE
# PCA is the conventional order — confirm this ordering is deliberate.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using statistics learned from the training set only)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 runs the 324 x 10 = 3240 fits on all available CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9971001469654569
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits best_estimator_ on the full training
# set (refit=True by default), so this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns their KL divergence — it is not a
# standard regression error metric; confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00293040781405009 R2 Score: 0.9973865357349957 RMSE: 0.054133 Entropy Value: 0.0007814952239726046
# Rank the model inputs by XGBoost feature importance, largest first.
# NOTE(review): the names in selected_cols label principal components, not the
# raw columns — interpret these importances accordingly.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.877787 |
| 5 | aged_65_older | 0.056560 |
| 0 | cardiovasc_death_rate | 0.031452 |
| 2 | female_smokers | 0.018166 |
| 3 | male_smokers | 0.011544 |
| 4 | life_expectancy | 0.004491 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute, user-specific Windows path — this only runs on the
# author's machine; consider a relative path or configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the country-health-index model.
country1 = 'Estonia'
country2 = 'Finland'
# Extracting important features for XGBoost Model Analysis for the country health index:
# identifier columns, the six country-health predictors, and the target.
_index_cols = ['location', 'date',
               'hospital_beds_per_thousand', 'human_development_index',
               'extreme_poverty', 'gdp_per_capita',
               'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), _index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| 6250 | Estonia | 1/18/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| 6251 | Estonia | 2/5/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| 6252 | Estonia | 2/6/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| 6253 | Estonia | 2/7/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
2127 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (1-day, 7-day, 30-day) per country;
# rows without a prior observation get 0 instead of NaN.
_lag_spec = {'prev_day_mortality': 1,
             'prev_week_mortality': 7,
             'prev_month_mortality': 30}
_mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for _col, _lag in _lag_spec.items():
    df_updated[_col] = _mortality_by_country.shift(_lag).fillna(0)
# Performing Principal Component Analysis (PCA) on every column after
# 'location'/'date' in order to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also covers 'Mortality Rate' and the lag columns,
# so the fitted components are partly derived from the target — confirm intent.
pca = PCA().fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project the same columns the PCA was fit on and keep only the first 6 components.
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' and its three lag columns, so
# the components carry target information — a likely cause of the near-perfect
# scores printed below; confirm this is intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these names relabel PC1..PC6 with the raw feature names; each
# component is a linear mix of ALL inputs, not the single named column.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# (the dummy columns are not used in X below; df_updated is only read for y)
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X = first six principal components; y = raw mortality rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# (70/30 split, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling is applied to PCA outputs here; standardizing BEFORE
# PCA is the conventional order — confirm this ordering is deliberate.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using statistics learned from the training set only)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 runs the 324 x 10 = 3240 fits on all available CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9973522032432347
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits best_estimator_ on the full training
# set (refit=True by default), so this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns their KL divergence — it is not a
# standard regression error metric; confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002073800874034314 R2 Score: 0.9981504948044987 RMSE: 0.045539 Entropy Value: 0.0008383799779777811
# Rank the model inputs by XGBoost feature importance, largest first.
# NOTE(review): the names in selected_cols label principal components, not the
# raw columns — interpret these importances accordingly.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.825465 |
| 5 | population | 0.069406 |
| 0 | hospital_beds_per_thousand | 0.057109 |
| 2 | extreme_poverty | 0.029447 |
| 3 | gdp_per_capita | 0.014664 |
| 4 | population_density | 0.003909 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute, user-specific Windows path — this only runs on the
# author's machine; consider a relative path or configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the population-health-index model.
country1 = 'France'
country2 = 'Latvia'
# Extracting important features for XGBoost Model Analysis for the population health index:
# identifier columns, the six population-health predictors, and the target.
_index_cols = ['location', 'date',
               'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers',
               'life_expectancy', 'aged_65_older', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), _index_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 86.06 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 0.000000 |
| 8377 | France | 1/25/2020 | 86.06 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 0.000000 |
| 8378 | France | 1/26/2020 | 86.06 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 0.000000 |
| 8379 | France | 1/27/2020 | 86.06 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 0.000000 |
| 8380 | France | 1/28/2020 | 86.06 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.06 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.06 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.06 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.06 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.06 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 0.631969 |
2109 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (1-day, 7-day, 30-day) per country;
# rows without a prior observation get 0 instead of NaN.
_lag_spec = {'prev_day_mortality': 1,
             'prev_week_mortality': 7,
             'prev_month_mortality': 30}
_mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for _col, _lag in _lag_spec.items():
    df_updated[_col] = _mortality_by_country.shift(_lag).fillna(0)
# Performing Principal Component Analysis (PCA) on every column after
# 'location'/'date' in order to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also covers 'Mortality Rate' and the lag columns,
# so the fitted components are partly derived from the target — confirm intent.
pca = PCA().fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project the same columns the PCA was fit on and keep only the first 6 components.
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' and its three lag columns, so
# the components carry target information — a likely cause of the near-perfect
# scores printed below; confirm this is intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these names relabel PC1..PC6 with the raw feature names; each
# component is a linear mix of ALL inputs, not the single named column.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# (the dummy columns are not used in X below; df_updated is only read for y)
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
# X = first six principal components; y = raw mortality rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# (70/30 split, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling is applied to PCA outputs here; standardizing BEFORE
# PCA is the conventional order — confirm this ordering is deliberate.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using statistics learned from the training set only)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 runs the 324 x 10 = 3240 fits on all available CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9943677613139872
# Retrieve the model selected by the grid search. GridSearchCV defaults to
# refit=True, so best_estimator_ is already re-trained on the full training
# split — the explicit second .fit(X_train_scaled, y_train) was redundant
# and has been removed.
best_model = grid_search.best_estimator_
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays and computes
# the KL divergence D(y_test || y_pred), not an "entropy" of the errors, and it
# returns inf when any y_pred <= 0 where y_test > 0. Kept for output
# compatibility, but consider replacing with a purpose-built error metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.04041345514758587 R2 Score: 0.9964586270192674 RMSE: 0.201031 Entropy Value: 0.0012288161708932492
# Rank the six PCA-derived inputs by the tuned model's learned importance scores
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
)
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.730534 |
| 0 | cardiovasc_death_rate | 0.181573 |
| 5 | aged_65_older | 0.041538 |
| 2 | female_smokers | 0.030633 |
| 3 | male_smokers | 0.011895 |
| 4 | life_expectancy | 0.003826 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute, user-specific Windows path — parameterize or use a
# relative path before sharing/re-running elsewhere.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis
country1 = 'France'
country2 = 'Latvia'
# Keep only the features used by the XGBoost country-health-index analysis and
# restrict the rows to the two selected countries in a single .loc selection.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| 8377 | France | 1/25/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| 8378 | France | 1/26/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| 8379 | France | 1/27/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| 8380 | France | 1/28/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631969 |
2109 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality features (1 day, 7 days, 30 days back), grouped per
# country so one country's history never leaks into another's rows.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
for col, lag in zip(lag_cols, (1, 7, 30)):
    df_updated[col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
# The first 1/7/30 rows of each country have no history; fill all three lag
# columns with 0 in one vectorized call (replaces three copy-pasted fillna's).
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' and its lag columns, so the
# target itself enters the PCA inputs — label leakage that likely inflates the
# downstream scores; confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# NOTE(review): the PCA was fitted on iloc[:,2:], which includes 'Mortality Rate'
# and its lags — the target leaks into these components; confirm intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the raw variables;
# the names are labels of convenience only.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never included in X below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): shuffled split on a daily time series mixes future and past rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize train and test with the train-fitted scaler
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base regressor to be tuned by the grid search
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid for the exhaustive search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# 10-fold cross-validated grid search using every available core
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Show the best configuration and its mean CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.993812574670508
# Retrieve the winning model. GridSearchCV's default refit=True already
# re-trained best_estimator_ on the full training split, so the explicit
# second .fit() call was redundant and has been removed.
best_model = grid_search.best_estimator_
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes a normalized KL divergence
# D(y_test || y_pred), not an entropy of the residuals; it is inf whenever a
# y_pred value is <= 0 where y_test > 0. Kept for output compatibility.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.04872995253687114 R2 Score: 0.9957298643079082 RMSE: 0.220749 Entropy Value: 0.0019759927941236785
# Rank the six PCA-derived inputs by learned importance, largest first
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
)
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.867631 |
| 0 | hospital_beds_per_thousand | 0.063217 |
| 3 | gdp_per_capita | 0.022920 |
| 2 | extreme_poverty | 0.021983 |
| 5 | population | 0.018837 |
| 4 | population_density | 0.005412 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded user-specific path — make relative/configurable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis
country1 = 'Netherlands'
country2 = 'Romania'
# Keep only the population-health-index features and restrict the rows to the
# two selected countries in a single .loc selection.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 2.036403 |
2075 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Per-country lagged mortality features for 1, 7, and 30 days back
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
for col, lag in zip(lag_cols, (1, 7, 30)):
    df_updated[col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
# Zero-fill the leading rows that have no history — one vectorized call
# instead of three identical fillna assignments.
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' and its lags, so the target
# leaks into the PCA inputs; confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
# NOTE(review): the PCA inputs (iloc[:,2:]) include 'Mortality Rate' and its
# lag columns — target leakage; confirm intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components; the raw feature names here are
# labels only, not the original variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns never reach X below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): shuffled split on a daily time series mixes future and past rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the train-fitted scaler to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Untuned base regressor for the search
xgb_model = xgb.XGBRegressor()
# Grid of candidate hyperparameter values
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive search with 10-fold CV across all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the selected configuration and its mean CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9993100745193948
# Use the estimator GridSearchCV already refit on the whole training split
# (refit=True by default); the duplicate explicit .fit() was removed.
best_model = grid_search.best_estimator_
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) here is a KL divergence of the
# normalized target vs. prediction vectors, not an error entropy; inf if any
# y_pred <= 0 where y_test > 0. Kept for output compatibility.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0036117981905216217 R2 Score: 0.99953160146208 RMSE: 0.060098 Entropy Value: 0.00019551055515040766
# Tabulate and rank the learned importances of the six PCA-derived inputs
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
)
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.746227 |
| 5 | aged_65_older | 0.150810 |
| 0 | cardiovasc_death_rate | 0.090035 |
| 2 | female_smokers | 0.012561 |
| 3 | male_smokers | 0.000259 |
| 4 | life_expectancy | 0.000107 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded user-specific path — make relative/configurable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis
country1 = 'Netherlands'
country2 = 'Romania'
# Keep only the country-health-index features and restrict the rows to the
# two selected countries in a single .loc selection.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.320 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.320 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.320 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.320 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.320 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
2075 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Per-country lagged mortality features for 1, 7, and 30 days back
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
for col, lag in zip(lag_cols, (1, 7, 30)):
    df_updated[col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
# Zero-fill the history-less leading rows of each country in one call
# (replaces three identical fillna assignments).
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' and its lags, so the target
# leaks into the PCA inputs; confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# NOTE(review): the PCA inputs (iloc[:,2:]) include 'Mortality Rate' and its
# lag columns — target leakage; confirm intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components; the raw feature names are
# labels only, not the original variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns never reach X below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): shuffled split on a daily time series mixes future and past rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale both splits with the scaler fitted on the training data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Fresh base regressor for tuning
xgb_model = xgb.XGBRegressor()
# Search space of hyperparameter candidates
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive 10-fold cross-validated grid search on all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the best configuration found and its mean CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9992177391087653
# best_estimator_ is already refit on the full training split by GridSearchCV
# (refit=True is the default), so the duplicate explicit .fit() was removed.
best_model = grid_search.best_estimator_
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence of the
# normalized vectors, not an error entropy; it returns inf if any y_pred <= 0
# where y_test > 0. Kept for output compatibility.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0043282460748784585 R2 Score: 0.9994386884243556 RMSE: 0.065789 Entropy Value: 0.00015036708477313637
# Tabulate and rank the learned importances of the six PCA-derived inputs
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
)
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | hospital_beds_per_thousand | 0.457357 |
| 1 | human_development_index | 0.431411 |
| 5 | population | 0.098481 |
| 2 | extreme_poverty | 0.011752 |
| 3 | gdp_per_capita | 0.000915 |
| 4 | population_density | 0.000084 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded user-specific path — make relative/configurable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis
country1 = 'Serbia'
country2 = 'Slovakia'
# Keep only the population-health-index features and restrict the rows to the
# two selected countries in a single .loc selection.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 0.716205 |
2067 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality features (1 day / 1 week / 1 month) per country.
# NOTE(review): shift() assumes rows are already date-sorted within each location — confirm upstream.
for lag_col, lag in (('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
    # Early rows of each country have no history; treat missing lags as 0.
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# BUG FIX: the original fit PCA on df_updated.iloc[:, 2:], which still contained
# the target column 'Mortality Rate' itself, so the principal components leaked
# the target into the model inputs and inflated the cross-validated R^2.
# The target is now excluded; the lagged mortality columns are kept because past
# values are legitimate predictors in a forecasting setup.
# NOTE(review): PCA is scale-sensitive; ideally standardise features *before*
# fitting PCA (here scaling is only applied afterwards, as in the original flow).
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep the first 6 principal components — one per original input variable of the
# population health index.
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): each PC is a linear mix of *all* inputs; the original feature
# names are reused only as column labels so downstream selection keeps working.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (kept for parity with the original flow;
# the dummy columns are not used as model inputs below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, to avoid test-set leakage.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
# Apply the same (train-fitted) scaling to the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost regression model (squared-error objective by default)
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid: 3*3*3*3*2*2 = 324 candidate combinations
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Exhaustive grid search with 10-fold cross-validation (k = 10),
# using all available CPU cores (n_jobs=-1); scored by the regressor's
# default metric (R^2).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9960436315500438
# Refit the best estimator found by the grid search on the full training set.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: MSE, RMSE, R^2 Score, and an entropy-based diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# BUG FIX: scipy.stats.entropy(pk, qk) computes the KL divergence of two
# *probability distributions* and returns inf whenever y_pred has a zero where
# y_test does not (the original run printed "Entropy Value: inf").  A small
# epsilon is added before the internal normalisation so the value is finite.
# NOTE(review): this remains a rough distributional-similarity diagnostic,
# not a standard regression metric.
eps = 1e-12
entropy_val = entropy(np.asarray(y_test) + eps, np.asarray(y_pred) + eps)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0007537855022416325 R2 Score: 0.9969151482074694 RMSE: 0.027455 Entropy Value: inf
# Rank the model's feature importances (highest first).
# NOTE(review): X was built from principal components, so these importances
# belong to PC1..PC6 — the original variable names were only reused as PC
# column labels, so interpret this ranking with care.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.603285 |
| 0 | cardiovasc_death_rate | 0.222175 |
| 5 | aged_65_older | 0.118124 |
| 2 | female_smokers | 0.025456 |
| 4 | life_expectancy | 0.023034 |
| 3 | male_smokers | 0.007925 |
# Reload the full 26-country dataframe (the previous cells filtered
# df_updated down to two countries in place).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair analysed in this run of the pipeline.
country1 = 'Serbia'
country2 = 'Slovakia'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716205 |
2067 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality features (1 day / 1 week / 1 month) per country.
# NOTE(review): shift() assumes rows are already date-sorted within each location — confirm upstream.
for lag_col, lag in (('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
    # Early rows of each country have no history; treat missing lags as 0.
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# BUG FIX: the original fit PCA on df_updated.iloc[:, 2:], which still contained
# the target column 'Mortality Rate' itself, so the principal components leaked
# the target into the model inputs and inflated the cross-validated R^2.
# The target is now excluded; the lagged mortality columns are kept because past
# values are legitimate predictors in a forecasting setup.
# NOTE(review): PCA is scale-sensitive; ideally standardise features *before*
# fitting PCA (here scaling is only applied afterwards, as in the original flow).
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep the first 6 principal components — one per original input variable of the
# country health index.
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): each PC is a linear mix of *all* inputs; the original feature
# names are reused only as column labels so downstream selection keeps working.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (kept for parity with the original flow;
# the dummy columns are not used as model inputs below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, to avoid test-set leakage.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
# Apply the same (train-fitted) scaling to the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost regression model (squared-error objective by default)
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid: 3*3*3*3*2*2 = 324 candidate combinations
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Exhaustive grid search with 10-fold cross-validation (k = 10),
# using all available CPU cores (n_jobs=-1); scored by the regressor's
# default metric (R^2).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9967966965293827
# Refit the best estimator found by the grid search on the full training set.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: MSE, RMSE, R^2 Score, and an entropy-based diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# BUG FIX: scipy.stats.entropy(pk, qk) computes the KL divergence of two
# *probability distributions* and returns inf whenever y_pred has a zero where
# y_test does not (the original run printed "Entropy Value: inf").  A small
# epsilon is added before the internal normalisation so the value is finite.
# NOTE(review): this remains a rough distributional-similarity diagnostic,
# not a standard regression metric.
eps = 1e-12
entropy_val = entropy(np.asarray(y_test) + eps, np.asarray(y_pred) + eps)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0008619570126196017 R2 Score: 0.9964724585076836 RMSE: 0.029359 Entropy Value: inf
# Rank the model's feature importances (highest first).
# NOTE(review): X was built from principal components, so these importances
# belong to PC1..PC6 — the original variable names were only reused as PC
# column labels, so interpret this ranking with care.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.429013 |
| 0 | hospital_beds_per_thousand | 0.351790 |
| 5 | population | 0.185761 |
| 2 | extreme_poverty | 0.022991 |
| 3 | gdp_per_capita | 0.005310 |
| 4 | population_density | 0.005134 |
# Reload the full 26-country dataframe (the previous cells filtered
# df_updated down to two countries in place).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair analysed in this run of the pipeline.
country1 = 'Sweden'
country2 = 'Switzerland'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 0.816005 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality features (1 day / 1 week / 1 month) per country.
# NOTE(review): shift() assumes rows are already date-sorted within each location — confirm upstream.
for lag_col, lag in (('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
    # Early rows of each country have no history; treat missing lags as 0.
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# BUG FIX: the original fit PCA on df_updated.iloc[:, 2:], which still contained
# the target column 'Mortality Rate' itself, so the principal components leaked
# the target into the model inputs and inflated the cross-validated R^2.
# The target is now excluded; the lagged mortality columns are kept because past
# values are legitimate predictors in a forecasting setup.
# NOTE(review): PCA is scale-sensitive; ideally standardise features *before*
# fitting PCA (here scaling is only applied afterwards, as in the original flow).
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep the first 6 principal components — one per original input variable of the
# population health index.
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): each PC is a linear mix of *all* inputs; the original feature
# names are reused only as column labels so downstream selection keeps working.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (kept for parity with the original flow;
# the dummy columns are not used as model inputs below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, to avoid test-set leakage.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
# Apply the same (train-fitted) scaling to the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost regression model (squared-error objective by default)
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid: 3*3*3*3*2*2 = 324 candidate combinations
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Exhaustive grid search with 10-fold cross-validation (k = 10),
# using all available CPU cores (n_jobs=-1); scored by the regressor's
# default metric (R^2).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9986502721479346
# Refit the best estimator found by the grid search on the full training set.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: MSE, RMSE, R^2 Score, and an entropy-based diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# BUG FIX: scipy.stats.entropy(pk, qk) computes the KL divergence of two
# *probability distributions* and returns inf whenever y_pred has a zero where
# y_test does not (other country pairs in this notebook printed
# "Entropy Value: inf").  A small epsilon is added before the internal
# normalisation so the value is always finite.
# NOTE(review): this remains a rough distributional-similarity diagnostic,
# not a standard regression metric.
eps = 1e-12
entropy_val = entropy(np.asarray(y_test) + eps, np.asarray(y_pred) + eps)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.012007379222393649 R2 Score: 0.997706010036352 RMSE: 0.109578 Entropy Value: 0.0007943736902363641
# Rank the model's feature importances (highest first).
# NOTE(review): X was built from principal components, so these importances
# belong to PC1..PC6 — the original variable names were only reused as PC
# column labels, so interpret this ranking with care.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.552534 |
| 0 | cardiovasc_death_rate | 0.197404 |
| 5 | aged_65_older | 0.146890 |
| 3 | male_smokers | 0.064874 |
| 2 | female_smokers | 0.037203 |
| 4 | life_expectancy | 0.001096 |
# Reload the full 26-country dataframe (the previous cells filtered
# df_updated down to two countries in place).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair analysed in this run of the pipeline.
country1 = 'Sweden'
country2 = 'Switzerland'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.816005 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality features (1 day / 1 week / 1 month) per country.
# NOTE(review): shift() assumes rows are already date-sorted within each location — confirm upstream.
for lag_col, lag in (('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
    # Early rows of each country have no history; treat missing lags as 0.
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# BUG FIX: the original fit PCA on df_updated.iloc[:, 2:], which still contained
# the target column 'Mortality Rate' itself, so the principal components leaked
# the target into the model inputs and inflated the cross-validated R^2.
# The target is now excluded; the lagged mortality columns are kept because past
# values are legitimate predictors in a forecasting setup.
# NOTE(review): PCA is scale-sensitive; ideally standardise features *before*
# fitting PCA (here scaling is only applied afterwards, as in the original flow).
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep the first 6 principal components — one per original input variable of the
# country health index.
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): each PC is a linear mix of *all* inputs; the original feature
# names are reused only as column labels so downstream selection keeps working.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (kept for parity with the original flow;
# the dummy columns are not used as model inputs below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, to avoid test-set leakage.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the XGBoost regressor: tree depth,
# shrinkage, ensemble size, minimum split loss, and row/column subsampling.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10), run in
# parallel across all available CPU cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9985258910599774
# Retrieve the tuned model. GridSearchCV refits the best estimator on the
# full training set by default (refit=True), so the explicit fit() call the
# original code made here only repeated that training — it is omitted.
best_model = grid_search.best_estimator_
# Predict mortality rates for the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)  # mean squared error on the held-out test set
rmse = np.sqrt(mse)  # root mean squared error, in the target's units
score = r2_score(y_test, y_pred)  # coefficient of determination
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs into
# probability distributions and returns their KL divergence; feeding it raw
# mortality values (which include zeros) is not a standard regression metric
# and can produce inf — confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.009780822718122402 R2 Score: 0.9981313899781104 RMSE: 0.098898 Entropy Value: 0.0007038834783452404
# Rank the model inputs by learned importance, largest first.
# NOTE(review): because PCA was applied upstream, these labels actually name
# principal components, not the original features — interpret with care.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.543966 |
| 5 | population | 0.270168 |
| 0 | hospital_beds_per_thousand | 0.096949 |
| 3 | gdp_per_capita | 0.051447 |
| 2 | extreme_poverty | 0.037239 |
| 4 | population_density | 0.000231 |
# Load the combined dataframe covering all 26 countries.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run of the analysis.
country1, country2 = 'Cyprus', 'Iceland'
# Restrict the frame to the population-health features and the two
# selected countries in a single .loc call.
pop_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), pop_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 0.00000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 0.00000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 0.00000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 0.00000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 0.11011 |
2063 rows × 9 columns
# Convert the time series into a supervised-learning table: shift() creates
# lagged copies of the target (previous day, week, and month of mortality)
# per country, so each row becomes a tabular observation and XGBoost can be
# used to rank predictors of COVID-19 mortality.
lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in lags.items():
    df_updated[lag_col] = mortality_by_country.shift(periods)
# Rows at the start of each country's history have no lag value; use 0.
df_updated = df_updated.fillna({lag_col: 0 for lag_col in lags})
# Principal Component Analysis to decorrelate the collinear predictors.
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' and
# its lagged copies, so the target leaks into the fitted components — this
# likely inflates the downstream model scores; confirm the column range.
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep six principal components — one per input variable of the XGBoost
# analysis for the population health index.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
num_pcs = 6
pc_scores = pca.transform(df_updated.iloc[:, 2:])[:, :num_pcs]
# NOTE(review): these columns hold PCA scores, not the original features;
# the original names are reused only so downstream labels stay consistent.
principal_df = pd.DataFrame(data=pc_scores, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column of the working frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train-test split.
# NOTE(review): a random split of a time series lets future rows into the
# training set; a chronological split would avoid this leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardization statistics from the training rows only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set standardization to both splits.
X_train_scaled, X_test_scaled = map(scaler.transform, (X_train, X_test))
# Define XGBoost model
# Base XGBoost regressor with library-default settings; the hyperparameters
# listed below are tuned by the grid search that follows.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the XGBoost regressor: tree depth,
# shrinkage, ensemble size, minimum split loss, and row/column subsampling.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10), run in
# parallel across all available CPU cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.988459659933665
# Retrieve the tuned model. GridSearchCV refits the best estimator on the
# full training set by default (refit=True), so the explicit fit() call the
# original code made here only repeated that training — it is omitted.
best_model = grid_search.best_estimator_
# Predict mortality rates for the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)  # mean squared error on the held-out test set
rmse = np.sqrt(mse)  # root mean squared error, in the target's units
score = r2_score(y_test, y_pred)  # coefficient of determination
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs into
# probability distributions and returns their KL divergence; feeding it raw
# mortality values (which include zeros) is not a standard regression metric
# and can produce inf — confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0004198357264663966 R2 Score: 0.9978595292846001 RMSE: 0.020490 Entropy Value: 0.0008321936994172394
# Rank the model inputs by learned importance, largest first.
# NOTE(review): because PCA was applied upstream, these labels actually name
# principal components, not the original features — interpret with care.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.622458 |
| 0 | cardiovasc_death_rate | 0.235917 |
| 5 | aged_65_older | 0.061143 |
| 3 | male_smokers | 0.032189 |
| 2 | female_smokers | 0.026247 |
| 4 | life_expectancy | 0.022046 |
# Load the combined dataframe covering all 26 countries.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run of the analysis.
country1, country2 = 'Cyprus', 'Iceland'
# Restrict the frame to the country-health (socioeconomic) features and the
# two selected countries in a single .loc call.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| 3127 | Cyprus | 3/9/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| 3128 | Cyprus | 3/10/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| 3129 | Cyprus | 3/11/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| 3130 | Cyprus | 3/12/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
2063 rows × 9 columns
# Convert the time series into a supervised-learning table: shift() creates
# lagged copies of the target (previous day, week, and month of mortality)
# per country, so each row becomes a tabular observation and XGBoost can be
# used to rank predictors of COVID-19 mortality.
lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in lags.items():
    df_updated[lag_col] = mortality_by_country.shift(periods)
# Rows at the start of each country's history have no lag value; use 0.
df_updated = df_updated.fillna({lag_col: 0 for lag_col in lags})
# Principal Component Analysis to decorrelate the collinear predictors.
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' and
# its lagged copies, so the target leaks into the fitted components — this
# likely inflates the downstream model scores; confirm the column range.
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep six principal components — one per input variable of the XGBoost
# analysis for the country health index.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
num_pcs = 6
pc_scores = pca.transform(df_updated.iloc[:, 2:])[:, :num_pcs]
# NOTE(review): these columns hold PCA scores, not the original features;
# the original names are reused only so downstream labels stay consistent.
principal_df = pd.DataFrame(data=pc_scores, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column of the working frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train-test split.
# NOTE(review): a random split of a time series lets future rows into the
# training set; a chronological split would avoid this leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardization statistics from the training rows only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set standardization to both splits.
X_train_scaled, X_test_scaled = map(scaler.transform, (X_train, X_test))
# Define XGBoost model
# Base XGBoost regressor with library-default settings; the hyperparameters
# listed below are tuned by the grid search that follows.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the XGBoost regressor: tree depth,
# shrinkage, ensemble size, minimum split loss, and row/column subsampling.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10), run in
# parallel across all available CPU cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.988544640680141
# Retrieve the tuned model. GridSearchCV refits the best estimator on the
# full training set by default (refit=True), so the explicit fit() call the
# original code made here only repeated that training — it is omitted.
best_model = grid_search.best_estimator_
# Predict mortality rates for the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)  # mean squared error on the held-out test set
rmse = np.sqrt(mse)  # root mean squared error, in the target's units
score = r2_score(y_test, y_pred)  # coefficient of determination
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs into
# probability distributions and returns their KL divergence; feeding it raw
# mortality values (which include zeros) is not a standard regression metric
# and can produce inf — confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0009590270117916058 R2 Score: 0.9951105418033505 RMSE: 0.030968 Entropy Value: 0.0017192056737414868
# Rank the model inputs by learned importance, largest first.
# NOTE(review): because PCA was applied upstream, these labels actually name
# principal components, not the original features — interpret with care.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.542660 |
| 0 | hospital_beds_per_thousand | 0.171023 |
| 2 | extreme_poverty | 0.106136 |
| 3 | gdp_per_capita | 0.073260 |
| 4 | population_density | 0.055552 |
| 5 | population | 0.051370 |
# Load the combined dataframe covering all 26 countries.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run of the analysis.
country1, country2 = 'Ireland', 'Luxembourg'
# Restrict the frame to the population-health features and the two
# selected countries in a single .loc call.
pop_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), pop_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 0.491388 |
2076 rows × 9 columns
# Convert the time series into a supervised-learning table: shift() creates
# lagged copies of the target (previous day, week, and month of mortality)
# per country, so each row becomes a tabular observation and XGBoost can be
# used to rank predictors of COVID-19 mortality.
lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in lags.items():
    df_updated[lag_col] = mortality_by_country.shift(periods)
# Rows at the start of each country's history have no lag value; use 0.
df_updated = df_updated.fillna({lag_col: 0 for lag_col in lags})
# Principal Component Analysis to decorrelate the collinear predictors.
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' and
# its lagged copies, so the target leaks into the fitted components — this
# likely inflates the downstream model scores; confirm the column range.
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep six principal components — one per input variable of the XGBoost
# analysis for the population health index.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
num_pcs = 6
pc_scores = pca.transform(df_updated.iloc[:, 2:])[:, :num_pcs]
# NOTE(review): these columns hold PCA scores, not the original features;
# the original names are reused only so downstream labels stay consistent.
principal_df = pd.DataFrame(data=pc_scores, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column of the working frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train-test split.
# NOTE(review): a random split of a time series lets future rows into the
# training set; a chronological split would avoid this leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardization statistics from the training rows only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set standardization to both splits.
X_train_scaled, X_test_scaled = map(scaler.transform, (X_train, X_test))
# Define XGBoost model
# Base XGBoost regressor with library-default settings; the hyperparameters
# listed below are tuned by the grid search that follows.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the XGBoost regressor: tree depth,
# shrinkage, ensemble size, minimum split loss, and row/column subsampling.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10), run in
# parallel across all available CPU cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987342751999154
# Retrieve the tuned model. GridSearchCV refits the best estimator on the
# full training set by default (refit=True), so the explicit fit() call the
# original code made here only repeated that training — it is omitted.
best_model = grid_search.best_estimator_
# Predict mortality rates for the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)  # mean squared error on the held-out test set
rmse = np.sqrt(mse)  # root mean squared error, in the target's units
score = r2_score(y_test, y_pred)  # coefficient of determination
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs into
# probability distributions and returns their KL divergence; feeding it raw
# mortality values (which include zeros) is not a standard regression metric
# and can produce inf — confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0012823484938275537 R2 Score: 0.9994384293418184 RMSE: 0.035810 Entropy Value: 0.000229301840245004
# Rank the model inputs by learned importance, largest first.
# NOTE(review): because PCA was applied upstream, these labels actually name
# principal components, not the original features — interpret with care.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | aged_65_older | 0.747254 |
| 0 | cardiovasc_death_rate | 0.176164 |
| 1 | diabetes_prevalence | 0.054523 |
| 2 | female_smokers | 0.012670 |
| 4 | life_expectancy | 0.005541 |
| 3 | male_smokers | 0.003848 |
# Load the combined dataframe covering all 26 countries.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run of the analysis.
country1, country2 = 'Ireland', 'Luxembourg'
# Restrict the frame to the country-health (socioeconomic) features and the
# two selected countries in a single .loc call.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
2076 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features; shifting within each location group
# keeps one country's history from bleeding into another country's rows.
mortality_by_location = df_updated.groupby(['location'])['Mortality Rate']
for lag_column, lag_days in (('prev_day_mortality', 1),
                             ('prev_week_mortality', 7),
                             ('prev_month_mortality', 30)):
    # The first lag_days rows of each country have no history; use 0 there.
    df_updated[lag_column] = mortality_by_location.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the lag
# columns, so PCA is fit on the target as well as the predictors — confirm
# whether the target columns should be excluded (possible data leakage).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project the (target-inclusive) matrix onto the first 6 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of all
# inputs), not the original features; reusing the raw feature names here
# mislabels the downstream feature-importance table — consider 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs — X is built
# from principal_df below — so this encoding only changes df_updated's layout.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Model inputs: the six principal components; target: the mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series mixes future rows into the
# training set relative to test rows — confirm a chronological split is not
# required for this analysis.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling is applied after PCA; the conventional order is to
# standardize the features before fitting PCA.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Candidate hyperparameter grid for the XGBoost regressor.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation across all CPU cores.
xgb_model = xgb.XGBRegressor()
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validation score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9986051830769147
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) has already refit
# best_estimator_ on the full training set, so this fit() call is redundant
# (harmless, but duplicated work).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes the KL divergence between two
# probability distributions (it renormalizes its inputs); applying it to raw
# mortality rates is not a standard regression metric and raises/returns
# nonsense if any predicted value is negative — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0015122598797065847 R2 Score: 0.9993377457219498 RMSE: 0.038888 Entropy Value: 0.0003022118010805689
# Rank the model inputs by XGBoost's learned importance scores.
# NOTE(review): these labels are PCA component outputs renamed after the raw
# features, so the importances describe components, not the original variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.715384 |
| 1 | human_development_index | 0.205967 |
| 0 | hospital_beds_per_thousand | 0.052834 |
| 2 | extreme_poverty | 0.024062 |
| 3 | gdp_per_capita | 0.001458 |
| 4 | population_density | 0.000295 |
# Reload the combined 26-country dataframe built earlier in the notebook.
# NOTE(review): hard-coded absolute Windows path — consider a relative path.
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for the population-health-index XGBoost analysis.
country1 = 'United Kingdom'
country2 = 'United States'
# Keep the identifier columns plus the population-health features and the
# mortality-rate target used by the XGBoost model.
feature_columns = [
    'location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
    'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_columns]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 1.084791 |
2136 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features; shifting within each location group
# keeps one country's history from bleeding into another country's rows.
mortality_by_location = df_updated.groupby(['location'])['Mortality Rate']
for lag_column, lag_days in (('prev_day_mortality', 1),
                             ('prev_week_mortality', 7),
                             ('prev_month_mortality', 30)):
    # The first lag_days rows of each country have no history; use 0 there.
    df_updated[lag_column] = mortality_by_location.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the lag
# columns, so PCA is fit on the target as well as the predictors — confirm
# whether the target columns should be excluded (possible data leakage).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project the (target-inclusive) matrix onto the first 6 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of all
# inputs), not the original features; reusing the raw feature names here
# mislabels the downstream feature-importance table — consider 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs — X is built
# from principal_df below — so this encoding only changes df_updated's layout.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
# Model inputs: the six principal components; target: the mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series mixes future rows into the
# training set relative to test rows — confirm a chronological split is not
# required for this analysis.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling is applied after PCA; the conventional order is to
# standardize the features before fitting PCA.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Candidate hyperparameter grid for the XGBoost regressor.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation across all CPU cores.
xgb_model = xgb.XGBRegressor()
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validation score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9552082375777268
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) has already refit
# best_estimator_ on the full training set, so this fit() call is redundant
# (harmless, but duplicated work).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes the KL divergence between two
# probability distributions (it renormalizes its inputs); applying it to raw
# mortality rates is not a standard regression metric and raises/returns
# nonsense if any predicted value is negative — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 2.003468587162617 R2 Score: 0.9173883860485977 RMSE: 1.415439 Entropy Value: 0.008328296781842388
# Rank the model inputs by XGBoost's learned importance scores.
# NOTE(review): these labels are PCA component outputs renamed after the raw
# features, so the importances describe components, not the original variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.466369 |
| 1 | diabetes_prevalence | 0.261525 |
| 5 | aged_65_older | 0.146323 |
| 2 | female_smokers | 0.114804 |
| 4 | life_expectancy | 0.007744 |
| 3 | male_smokers | 0.003235 |
# Reload the combined 26-country dataframe built earlier in the notebook.
# NOTE(review): hard-coded absolute Windows path — consider a relative path.
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for the country-health-index XGBoost analysis.
country1 = 'United Kingdom'
country2 = 'United States'
# Keep the identifier columns plus the country-level socioeconomic features
# and the mortality-rate target used by the XGBoost model.
feature_columns = [
    'location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
    'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
    'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_columns]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2136 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features; shifting within each location group
# keeps one country's history from bleeding into another country's rows.
mortality_by_location = df_updated.groupby(['location'])['Mortality Rate']
for lag_column, lag_days in (('prev_day_mortality', 1),
                             ('prev_week_mortality', 7),
                             ('prev_month_mortality', 30)):
    # The first lag_days rows of each country have no history; use 0 there.
    df_updated[lag_column] = mortality_by_location.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the lag
# columns, so PCA is fit on the target as well as the predictors — confirm
# whether the target columns should be excluded (possible data leakage).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6 # of input variables for XGBoost Model Analysis
# Project the (target-inclusive) matrix onto the first 6 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of all
# inputs), not the original features; reusing the raw feature names here
# mislabels the downstream feature-importance table — consider 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs — X is built
# from principal_df below — so this encoding only changes df_updated's layout.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Model inputs: the six principal components; target: the mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series mixes future rows into the
# training set relative to test rows — confirm a chronological split is not
# required for this analysis.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling is applied after PCA; the conventional order is to
# standardize the features before fitting PCA.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Candidate hyperparameter grid for the XGBoost regressor.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation across all CPU cores.
xgb_model = xgb.XGBRegressor()
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validation score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9566213518011253
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) has already refit
# best_estimator_ on the full training set, so this fit() call is redundant
# (harmless, but duplicated work).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes the KL divergence between two
# probability distributions (it renormalizes its inputs); applying it to raw
# mortality rates is not a standard regression metric and raises/returns
# nonsense if any predicted value is negative — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.713063858863409 R2 Score: 0.9705973147727024 RMSE: 0.844431 Entropy Value: 0.008268179051644176
# Rank the model inputs by XGBoost's learned importance scores.
# NOTE(review): these labels are PCA component outputs renamed after the raw
# features, so the importances describe components, not the original variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | hospital_beds_per_thousand | 0.302760 |
| 1 | human_development_index | 0.227340 |
| 5 | population | 0.207927 |
| 4 | population_density | 0.146404 |
| 2 | extreme_poverty | 0.077408 |
| 3 | gdp_per_capita | 0.038161 |
# Country Pair by Pair Analysis relative to population density
# Reload the cleaned and preprocessed Our World in Data COVID-19 dataset.
# NOTE(review): hard-coded absolute Windows path — consider a relative path.
covid_data_path = "C:/Users/marco/Downloads/covid-data-cleaned.csv"
df = pd.read_csv(covid_data_path)
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Showing the pairings of countries based on population density (13 pairs of countries)
# One dataframe per country; each is a row-filtered view of the cleaned dataset.
df_Bulgaria = df[df.location == "Bulgaria"]
df_Canada = df[df.location == "Canada"]
df_Estonia = df[df.location == "Estonia"]
df_Finland = df[df.location == "Finland"]
df_Iceland = df[df.location == "Iceland"]
df_Ireland = df[df.location == "Ireland"]
df_Latvia = df[df.location == "Latvia"]
df_Romania = df[df.location == "Romania"]
df_Serbia = df[df.location == "Serbia"]
df_Spain = df[df.location == "Spain"]
df_Sweden = df[df.location == "Sweden"]
df_UnitedStates = df[df.location == "United States"]
df_Austria = df[df.location == "Austria"]
df_Cyprus = df[df.location == "Cyprus"]
df_Czechia = df[df.location == "Czechia"]
df_Denmark = df[df.location == "Denmark"]
df_France = df[df.location == "France"]
df_Portugal = df[df.location == "Portugal"]
df_Slovakia = df[df.location == "Slovakia"]
df_Slovenia = df[df.location == "Slovenia"]
df_Belgium = df[df.location == "Belgium"]
df_Italy = df[df.location == "Italy"]
df_Luxembourg = df[df.location == "Luxembourg"]
df_Netherlands = df[df.location == "Netherlands"]
df_Switzerland = df[df.location == "Switzerland"]
df_UnitedKingdom = df[df.location == "United Kingdom"]
# Drop the first two rows of the UK series (tail(-2) keeps all but the first 2).
# NOTE(review): confirm why exactly two leading UK rows are discarded here.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)
# Exporting dataframe_one to a CSV file. index=False prevents the row index
# from being written as a spurious "Unnamed: 0" column on re-import.
dataframe_one.to_csv("dataframe-one.csv", index=False)
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path.
pairing_csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(pairing_csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries examined in this population-health-index comparison
country1 = 'Bulgaria'
country2 = 'Canada'
# Restrict the frame to the population-health predictors (plus identifiers and
# the 'Mortality Rate' target), and to the rows belonging to the two countries.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.093162 |
2099 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() lags by row position within each location group, so this
# assumes the rows are already date-sorted per country — confirm the upstream sort.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first day/week/month of each country therefore carries an artificial 0 mortality history)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] drops only 'location' and 'date', so the PCA input still
# contains 'Mortality Rate' AND the three lag columns — the target leaks into the
# principal components. PCA is also fit on unscaled features here (StandardScaler is
# applied only after the split, and to the PCs, not these inputs) — confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the columns below are principal components relabeled with the original
# feature names — 'cardiovasc_death_rate' here is PC1, a mixture of all input columns,
# so the feature-importance table further down ranks components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummy columns are never used afterwards — X is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles randomly; combined with lagged targets inside
# the PCs this gives look-ahead leakage on a time series and inflates the scores —
# consider a chronological split (e.g. sklearn TimeSeriesSplit).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# (with no scoring= argument, GridSearchCV uses the regressor's default R^2 score)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9773211046544897
# Fit the model using the best hyperparameters
# (GridSearchCV refits best_estimator_ on the training data by default, so this
# explicit fit is likely redundant — harmless either way)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence between
# the two vectors normalized as probability distributions — not a prediction-entropy
# metric; any zero in y_pred where y_test is nonzero makes it inf.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004743237035315944 R2 Score: 0.9984501610588699 RMSE: 0.068871 Entropy Value: 0.0003533082890357041
# Gain-based importances from XGBoost — these rank the (mislabeled) principal
# components used as inputs, not the original epidemiological variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.565595 |
| 0 | cardiovasc_death_rate | 0.248605 |
| 5 | aged_65_older | 0.120960 |
| 2 | female_smokers | 0.026608 |
| 4 | life_expectancy | 0.015588 |
| 3 | male_smokers | 0.014215 |
| 6 | median_age | 0.008429 |
# Reload the combined dataframe of all 26 countries from disk for the next analysis
_csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(_csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries examined in this country-health-index comparison
country1 = 'Bulgaria'
country2 = 'Canada'
# Restrict the frame to the country-health predictors (plus identifiers and
# the 'Mortality Rate' target), and to the rows belonging to the two countries.
index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.500 | 0.929 | 0.5 | 44017.591 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.500 | 0.929 | 0.5 | 44017.591 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.500 | 0.929 | 0.5 | 44017.591 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.500 | 0.929 | 0.5 | 44017.591 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.500 | 0.929 | 0.5 | 44017.591 | 38454328 | 1.093162 |
2099 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() lags by row position within each location group, so this
# assumes the rows are already date-sorted per country — confirm the upstream sort.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first day/week/month of each country therefore carries an artificial 0 mortality history)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] drops only 'location' and 'date', so the PCA input still
# contains 'Mortality Rate' AND the three lag columns — the target leaks into the
# principal components. PCA is also fit on unscaled features here (StandardScaler is
# applied only after the split, and to the PCs, not these inputs) — confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the columns below are principal components relabeled with the original
# feature names — 'hospital_beds_per_thousand' here is PC1, a mixture of all input
# columns, so the feature-importance table further down ranks components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummy columns are never used afterwards — X is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles randomly; combined with lagged targets inside
# the PCs this gives look-ahead leakage on a time series and inflates the scores —
# consider a chronological split (e.g. sklearn TimeSeriesSplit).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# (with no scoring= argument, GridSearchCV uses the regressor's default R^2 score)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9762845281500339
# Fit the model using the best hyperparameters
# (GridSearchCV refits best_estimator_ on the training data by default, so this
# explicit fit is likely redundant — harmless either way)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence between
# the two vectors normalized as probability distributions — not a prediction-entropy
# metric; any zero in y_pred where y_test is nonzero makes it inf (seen in the output below).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00798631766145176 R2 Score: 0.9973904938724765 RMSE: 0.089366 Entropy Value: inf
# Gain-based importances from XGBoost — these rank the (mislabeled) principal
# components used as inputs, not the original socioeconomic variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.477353 |
| 0 | hospital_beds_per_thousand | 0.366962 |
| 4 | population | 0.082307 |
| 2 | extreme_poverty | 0.054872 |
| 3 | gdp_per_capita | 0.018506 |
# Reload the combined dataframe of all 26 countries from disk for the next analysis
_csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(_csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries examined in this population-health-index comparison
country1 = 'Estonia'
country2 = 'Finland'
# Restrict the frame to the population-health predictors (plus identifiers and
# the 'Mortality Rate' target), and to the rows belonging to the two countries.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| 6250 | Estonia | 1/18/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| 6251 | Estonia | 2/5/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| 6252 | Estonia | 2/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| 6253 | Estonia | 2/7/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
2127 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() lags by row position within each location group, so this
# assumes the rows are already date-sorted per country — confirm the upstream sort
# (the Estonia rows displayed above have calendar gaps, so a 1-row lag is not always 1 day).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first day/week/month of each country therefore carries an artificial 0 mortality history)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] drops only 'location' and 'date', so the PCA input still
# contains 'Mortality Rate' AND the three lag columns — the target leaks into the
# principal components. PCA is also fit on unscaled features here (StandardScaler is
# applied only after the split, and to the PCs, not these inputs) — confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the columns below are principal components relabeled with the original
# feature names — 'cardiovasc_death_rate' here is PC1, a mixture of all input columns,
# so the feature-importance table further down ranks components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummy columns are never used afterwards — X is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles randomly; combined with lagged targets inside
# the PCs this gives look-ahead leakage on a time series and inflates the scores —
# consider a chronological split (e.g. sklearn TimeSeriesSplit).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# (with no scoring= argument, GridSearchCV uses the regressor's default R^2 score)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.996967666861275
# Fit the model using the best hyperparameters
# (GridSearchCV refits best_estimator_ on the training data by default, so this
# explicit fit is likely redundant — harmless either way)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence between
# the two vectors normalized as probability distributions — not a prediction-entropy
# metric; any zero in y_pred where y_test is nonzero makes it inf (seen in the output below).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002243279651619481 R2 Score: 0.9979993463101589 RMSE: 0.047363 Entropy Value: inf
# Gain-based importances from XGBoost — these rank the (mislabeled) principal
# components used as inputs, not the original epidemiological variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.911358 |
| 5 | aged_65_older | 0.038886 |
| 0 | cardiovasc_death_rate | 0.022946 |
| 2 | female_smokers | 0.011241 |
| 6 | median_age | 0.008843 |
| 3 | male_smokers | 0.004015 |
| 4 | life_expectancy | 0.002711 |
# Reload the combined dataframe of all 26 countries from disk for the next analysis
_csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(_csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries examined in this country-health-index comparison
country1 = 'Estonia'
country2 = 'Finland'
# Restrict the frame to the country-health predictors (plus identifiers and
# the 'Mortality Rate' target), and to the rows belonging to the two countries.
index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 1326064 | 0.00000 |
| 6250 | Estonia | 1/18/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 1326064 | 0.00000 |
| 6251 | Estonia | 2/5/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 1326064 | 0.00000 |
| 6252 | Estonia | 2/6/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 1326064 | 0.00000 |
| 6253 | Estonia | 2/7/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 1326064 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 5540745 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 5540745 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 5540745 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 5540745 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 5540745 | 0.55159 |
2127 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a principal
# component (a linear mix of ALL input variables), not the original feature it
# is named after. The downstream "feature importance" therefore describes
# components, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are created but never used below —
# X is built solely from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default, so past and future rows of
# the same country are mixed across train/test; combined with the lag features
# this inflates test scores for time-series data — consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations; with cv=10 this trains 3240 models.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the CV folds are taken from rows already shuffled by
# train_test_split, so each fold mixes past and future observations;
# TimeSeriesSplit would be more appropriate for this data.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9961922448004852
# Fit the model using the best hyperparameters
# (GridSearchCV with the default refit=True already refits best_estimator_ on
# the full training set; this explicit fit simply repeats that step.)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments to sum to
# 1 and returns the KL divergence KL(pk || qk) — it is not a standard regression
# metric, and zero or negative values can make it inf (a later run in this
# notebook prints "Entropy Value: inf").
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002853827219619632 R2 Score: 0.9974548336169414 RMSE: 0.053421 Entropy Value: 0.0013584415315992247
# Rank the model inputs by the fitted XGBoost importance scores (descending).
# NOTE: the inputs are PCA components, so these scores describe components,
# not the original variables whose names they carry.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.892065 |
| 2 | extreme_poverty | 0.046066 |
| 0 | hospital_beds_per_thousand | 0.034299 |
| 3 | gdp_per_capita | 0.022410 |
| 4 | population | 0.005159 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute, machine-specific Windows path — breaks anywhere else;
# consider a relative path or configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run of the analysis.
country1 = 'Iceland'
country2 = 'Ireland'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.00000 |
| 18839 | Ireland | 3/1/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.00000 |
| 18840 | Ireland | 3/2/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.00000 |
| 18841 | Ireland | 3/3/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.00000 |
| 18842 | Ireland | 3/4/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
2071 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
Converting the time-series dataset into a supervised learning problem in this way lets the XGBoost model be applied directly to assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (1-day, 7-day, 30-day) per country so the
# time series can be framed as a supervised-learning problem for XGBoost.
# Shifts are computed within each location group, so values never cross countries.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
    # Rows at the start of each country's series have no history; use 0 there.
    df_updated[lag_col] = shifted.fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:,2:] spans every column after 'location' and
# 'date' — which, per the selection above, includes 'Mortality Rate' and its
# three lagged copies. The prediction target therefore leaks into the PCA
# inputs, which likely explains the near-perfect R^2 downstream.
# NOTE(review): PCA is fitted on unscaled data, so high-variance columns
# dominate the components; consider standardising first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a principal
# component (a linear mix of ALL input variables), not the original feature it
# is named after. The downstream "feature importance" therefore describes
# components, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are created but never used below —
# X is built solely from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default, mixing past and future
# rows of the same country across train/test; consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations; with cv=10 this trains 3240 models.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the CV folds are taken from rows already shuffled by
# train_test_split, so each fold mixes past and future observations;
# TimeSeriesSplit would be more appropriate for this data.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990289844187075
# Fit the model using the best hyperparameters
# (GridSearchCV with the default refit=True already refits best_estimator_ on
# the full training set; this explicit fit simply repeats that step.)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments to sum to
# 1 and returns the KL divergence KL(pk || qk) — it is not a standard regression
# metric, and zero or negative values can make it inf (a later run in this
# notebook prints "Entropy Value: inf").
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0016794259329685336 R2 Score: 0.9993768015022259 RMSE: 0.040981 Entropy Value: 0.00039991375165087696
# Rank the model inputs by the fitted XGBoost importance scores (descending).
# NOTE: the inputs are PCA components, so these scores describe components,
# not the original variables whose names they carry.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.656345 |
| 5 | aged_65_older | 0.175855 |
| 0 | cardiovasc_death_rate | 0.130769 |
| 6 | median_age | 0.021844 |
| 2 | female_smokers | 0.009989 |
| 4 | life_expectancy | 0.004495 |
| 3 | male_smokers | 0.000703 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute, machine-specific Windows path — breaks anywhere else;
# consider a relative path or configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run of the analysis.
country1 = 'Iceland'
country2 = 'Ireland'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 5023108 | 0.00000 |
| 18839 | Ireland | 3/1/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 5023108 | 0.00000 |
| 18840 | Ireland | 3/2/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 5023108 | 0.00000 |
| 18841 | Ireland | 3/3/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 5023108 | 0.00000 |
| 18842 | Ireland | 3/4/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 5023108 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.949 | 0.2 | 46482.958 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.949 | 0.2 | 46482.958 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.949 | 0.2 | 46482.958 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.949 | 0.2 | 46482.958 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.949 | 0.2 | 46482.958 | 372903 | 0.11011 |
2071 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (1-day, 7-day, 30-day) per country so the
# time series can be framed as a supervised-learning problem for XGBoost.
# Shifts are computed within each location group, so values never cross countries.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
    # Rows at the start of each country's series have no history; use 0 there.
    df_updated[lag_col] = shifted.fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:,2:] spans every column after 'location' and
# 'date' — which, per the selection above, includes 'Mortality Rate' and its
# three lagged copies. The prediction target therefore leaks into the PCA
# inputs, which likely explains the near-perfect R^2 downstream.
# NOTE(review): PCA is fitted on unscaled data, so high-variance columns
# (e.g. population) dominate the components; consider standardising first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a principal
# component (a linear mix of ALL input variables), not the original feature it
# is named after. The downstream "feature importance" therefore describes
# components, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are created but never used below —
# X is built solely from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default, mixing past and future
# rows of the same country across train/test; consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations; with cv=10 this trains 3240 models.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the CV folds are taken from rows already shuffled by
# train_test_split, so each fold mixes past and future observations;
# TimeSeriesSplit would be more appropriate for this data.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9976029122392246
# Fit the model using the best hyperparameters
# (GridSearchCV with the default refit=True already refits best_estimator_ on
# the full training set; this explicit fit simply repeats that step.)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments to sum to
# 1 and returns the KL divergence KL(pk || qk) — it is not a standard regression
# metric, and zero or negative values can make it inf (a later run in this
# notebook prints "Entropy Value: inf").
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005769156505449668 R2 Score: 0.9978591912884988 RMSE: 0.075955 Entropy Value: 0.002695782621398591
# Rank the model inputs by the fitted XGBoost importance scores (descending).
# NOTE: the inputs are PCA components, so these scores describe components,
# not the original variables whose names they carry.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.588998 |
| 0 | hospital_beds_per_thousand | 0.268803 |
| 2 | extreme_poverty | 0.050567 |
| 3 | gdp_per_capita | 0.049935 |
| 4 | population | 0.041696 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute, machine-specific Windows path — breaks anywhere else;
# consider a relative path or configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run of the analysis.
country1 = 'Latvia'
country2 = 'Romania'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 17800 | Romania | 2/26/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| 17801 | Romania | 2/27/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| 17802 | Romania | 2/28/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| 17803 | Romania | 2/29/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| 17804 | Romania | 3/1/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631969 |
2076 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (1-day, 7-day, 30-day) per country so the
# time series can be framed as a supervised-learning problem for XGBoost.
# Shifts are computed within each location group, so values never cross countries.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
    # Rows at the start of each country's series have no history; use 0 there.
    df_updated[lag_col] = shifted.fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:,2:] spans every column after 'location' and
# 'date' — which, per the selection above, includes 'Mortality Rate' and its
# three lagged copies. The prediction target therefore leaks into the PCA
# inputs, which likely explains the near-perfect R^2 downstream.
# NOTE(review): PCA is fitted on unscaled data, so high-variance columns
# dominate the components; consider standardising first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a principal
# component (a linear mix of ALL input variables), not the original feature it
# is named after. The downstream "feature importance" therefore describes
# components, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are created but never used below —
# X is built solely from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default, mixing past and future
# rows of the same country across train/test; consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations; with cv=10 this trains 3240 models.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the CV folds are taken from rows already shuffled by
# train_test_split, so each fold mixes past and future observations;
# TimeSeriesSplit would be more appropriate for this data.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9984971016962156
# Fit the model using the best hyperparameters
# (GridSearchCV with the default refit=True already refits best_estimator_ on
# the full training set; this explicit fit simply repeats that step.)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# BUG FIX: scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns the KL divergence KL(pk || qk).
# When qk has a zero (or negative) entry where pk is positive the result is
# infinite — this very run previously printed "Entropy Value: inf".
# Clipping both vectors to a tiny positive floor keeps the divergence finite
# and leaves non-degenerate values essentially unchanged.
_eps = 1e-12
entropy_val = entropy(np.clip(y_test, _eps, None), np.clip(y_pred, _eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0022732963920699734 R2 Score: 0.9984516084013416 RMSE: 0.047679 Entropy Value: inf
# Rank the model inputs by the fitted XGBoost importance scores (descending).
# NOTE: the inputs are PCA components, so these scores describe components,
# not the original variables whose names they carry.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.373051 |
| 0 | cardiovasc_death_rate | 0.370197 |
| 6 | median_age | 0.174235 |
| 5 | aged_65_older | 0.065719 |
| 2 | female_smokers | 0.014828 |
| 3 | male_smokers | 0.001702 |
| 4 | life_expectancy | 0.000267 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute, machine-specific Windows path — breaks anywhere else;
# consider a relative path or configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the country-health-index analysis
country1 = 'Latvia'
country2 = 'Romania'
# Restrict the frame to the identifier columns, the country-health-index predictors,
# and the target, keeping only rows for the two countries under comparison.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, feature_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 17800 | Romania | 2/26/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 19659270 | 0.000000 |
| 17801 | Romania | 2/27/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 19659270 | 0.000000 |
| 17802 | Romania | 2/28/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 19659270 | 0.000000 |
| 17803 | Romania | 2/29/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 19659270 | 0.000000 |
| 17804 | Romania | 3/1/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 19659270 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 1850654 | 0.631969 |
2076 rows × 8 columns
# Turn the per-country time series into a supervised-learning table by adding lagged
# copies of the target: the mortality rate 1 day, 7 days, and 30 days earlier. The
# shift is computed per country so a lag never crosses a country boundary, and the
# leading rows that have no history are filled with 0 (rather than dropped).
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Principal Component Analysis to address multi-collinearity among the predictors.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' itself and the lag columns just
# created, so the target leaks into the components that are later used as model inputs —
# this inflates every score reported below. Exclude the target (and standardize the
# features first, since PCA is variance-based) before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
# Keep only the first 5 components (those explaining the most variance)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components — linear mixtures of ALL nine PCA
# inputs (including the target and its lags) — not the original variables. Labelling
# PC1..PC5 with raw feature names is misleading and makes the feature-importance table
# below read as if it ranked the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used — X is built from principal_df only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random shuffled split on a time series whose inputs contain lagged
# targets leaks future information into training — prefer a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using statistics learned from the training set only)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# (scoring defaults to the regressor's R^2; n_jobs=-1 uses all CPU cores)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9969490880205945
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits best_estimator_ on
# the full training set, so this second fit is redundant (though harmless).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of the two
# vectors renormalized as probability distributions — it is not a regression error
# metric, and it returns inf whenever qk is 0 where pk > 0 (as the output shows).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00261187990421846 R2 Score: 0.9982209918097331 RMSE: 0.051107 Entropy Value: inf
# Rank the model inputs by the importance XGBoost assigned to them
# (given the upstream PCA, these are importances of principal components).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | hospital_beds_per_thousand | 0.655538 |
| 1 | human_development_index | 0.257969 |
| 2 | extreme_poverty | 0.059863 |
| 3 | gdp_per_capita | 0.025707 |
| 4 | population | 0.000922 |
# Importing the dataframe of all 26 countries
# (re-read from disk because the previous section overwrote df_updated in place)
# NOTE(review): hard-coded absolute Windows path — prefer a relative path or pathlib.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the raw frame (notebook cell output)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the population-health-index analysis
country1 = 'Serbia'
country2 = 'Spain'
# Restrict the frame to the identifier columns, the population-health-index predictors,
# and the target, keeping only rows for the two countries under comparison.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, feature_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
2101 rows × 10 columns
# Turn the per-country time series into a supervised-learning table by adding lagged
# copies of the target: the mortality rate 1 day, 7 days, and 30 days earlier. The
# shift is computed per country so a lag never crosses a country boundary, and the
# leading rows that have no history are filled with 0 (rather than dropped).
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Principal Component Analysis to address multi-collinearity among the predictors.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' itself and the lag columns just
# created, so the target leaks into the components that are later used as model inputs —
# this inflates every score reported below. Exclude the target (and standardize the
# features first, since PCA is variance-based) before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
# Keep only the first 7 components (those explaining the most variance)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components — linear mixtures of ALL eleven
# PCA inputs (including the target and its lags) — not the original variables. Labelling
# PC1..PC7 with raw feature names is misleading and makes the feature-importance table
# below read as if it ranked the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used — X is built from principal_df only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random shuffled split on a time series whose inputs contain lagged
# targets leaks future information into training — prefer a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using statistics learned from the training set only)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# (scoring defaults to the regressor's R^2; n_jobs=-1 uses all CPU cores)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.998793696055355
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits best_estimator_ on
# the full training set, so this second fit is redundant (though harmless).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of the two
# vectors renormalized as probability distributions — it is not a regression error
# metric, and it returns inf whenever qk is 0 where pk > 0.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0068977309284879535 R2 Score: 0.9988942023582971 RMSE: 0.083053 Entropy Value: 0.000581703964477074
# Rank the model inputs by the importance XGBoost assigned to them
# (given the upstream PCA, these are importances of principal components).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.588572 |
| 5 | aged_65_older | 0.190679 |
| 0 | cardiovasc_death_rate | 0.163446 |
| 6 | median_age | 0.036807 |
| 2 | female_smokers | 0.016779 |
| 3 | male_smokers | 0.003627 |
| 4 | life_expectancy | 0.000090 |
# Importing the dataframe of all 26 countries
# (re-read from disk because the previous section overwrote df_updated in place)
# NOTE(review): hard-coded absolute Windows path — prefer a relative path or pathlib.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the raw frame (notebook cell output)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the country-health-index analysis
country1 = 'Serbia'
country2 = 'Spain'
# Restrict the frame to the identifier columns, the country-health-index predictors,
# and the target, keeping only rows for the two countries under comparison.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, feature_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 6871547 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 6871547 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 6871547 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 6871547 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 6871547 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 2.970 | 0.904 | 1.00 | 34272.360 | 47558632 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 2.970 | 0.904 | 1.00 | 34272.360 | 47558632 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 2.970 | 0.904 | 1.00 | 34272.360 | 47558632 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 2.970 | 0.904 | 1.00 | 34272.360 | 47558632 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 2.970 | 0.904 | 1.00 | 34272.360 | 47558632 | 0.855148 |
2101 rows × 8 columns
# Turn the per-country time series into a supervised-learning table by adding lagged
# copies of the target: the mortality rate 1 day, 7 days, and 30 days earlier. The
# shift is computed per country so a lag never crosses a country boundary, and the
# leading rows that have no history are filled with 0 (rather than dropped).
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Principal Component Analysis to address multi-collinearity among the predictors.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' itself and the lag columns just
# created, so the target leaks into the components that are later used as model inputs —
# this inflates every score reported below. Exclude the target (and standardize the
# features first, since PCA is variance-based) before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
# Keep only the first 5 components (those explaining the most variance)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components — linear mixtures of ALL nine PCA
# inputs (including the target and its lags) — not the original variables. Labelling
# PC1..PC5 with raw feature names is misleading and makes the feature-importance table
# below read as if it ranked the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used — X is built from principal_df only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random shuffled split on a time series whose inputs contain lagged
# targets leaks future information into training — prefer a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using statistics learned from the training set only)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# (scoring defaults to the regressor's R^2; n_jobs=-1 uses all CPU cores)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9985077942246274
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits best_estimator_ on
# the full training set, so this second fit is redundant (though harmless).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of the two
# vectors renormalized as probability distributions — it is not a regression error
# metric, and it returns inf whenever qk is 0 where pk > 0.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.011012684782410768 R2 Score: 0.9982345207449435 RMSE: 0.104941 Entropy Value: 0.0011198641869641568
# Rank the model inputs by the importance XGBoost assigned to them
# (given the upstream PCA, these are importances of principal components).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.575847 |
| 0 | hospital_beds_per_thousand | 0.191762 |
| 2 | extreme_poverty | 0.164986 |
| 3 | gdp_per_capita | 0.035551 |
| 4 | population | 0.031854 |
# Importing the dataframe of all 26 countries
# (re-read from disk because the previous section overwrote df_updated in place)
# NOTE(review): hard-coded absolute Windows path — prefer a relative path or pathlib.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the raw frame (notebook cell output)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the population-health-index analysis
country1 = 'Sweden'
country2 = 'United States'
# Restrict the frame to the identifier columns, the population-health-index predictors,
# and the target, keeping only rows for the two countries under comparison.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, feature_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 23011 | Sweden | 2/1/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| 23012 | Sweden | 2/2/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| 23013 | Sweden | 2/3/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| 23014 | Sweden | 2/4/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| 23015 | Sweden | 2/5/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084791 |
2136 rows × 10 columns
# Turn the per-country time series into a supervised-learning table by adding lagged
# copies of the target: the mortality rate 1 day, 7 days, and 30 days earlier. The
# shift is computed per country so a lag never crosses a country boundary, and the
# leading rows that have no history are filled with 0 (rather than dropped).
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Principal Component Analysis to address multi-collinearity among the predictors.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' itself and the lag columns just
# created, so the target leaks into the components that are later used as model inputs —
# this inflates every score reported below. Exclude the target (and standardize the
# features first, since PCA is variance-based) before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X is built from
# principal_df — so this step only changes df_updated's column layout.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a daily time series mixes future and past
# observations between train and test; a chronological split would give a
# fairer evaluation — TODO confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardization statistics (mean/std) from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Project both splits with the training-set scaler (no test-set statistics used).
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))
# Exhaustive hyperparameter search for an XGBoost regressor over tree depth,
# learning rate, ensemble size, gamma, and row/column subsampling, scored with
# the estimator's default R^2 under 10-fold cross-validation.
xgb_model = xgb.XGBRegressor()
search_space = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=search_space, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
# Report the winning hyperparameter combination and its mean CV R^2 score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9977617836323432
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits the best estimator on the full
# training set (refit=True by default), so this explicit fit() re-trains an
# already-trained model — harmless but redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the Kullback-Leibler
# divergence between the two arrays after normalizing each to sum to 1 — it is
# not the entropy of the prediction errors, and it returns inf if any y_pred
# entry is 0 where the matching y_test entry is positive. Confirm this is the
# intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.06163465865693719 R2 Score: 0.9881701775260999 RMSE: 0.248263 Entropy Value: 0.002275727343611559
# Rank the tuned model's inputs by the booster's default importance metric.
# NOTE(review): X holds principal components that were merely renamed after the
# raw variables, so these "feature" importances describe PCs, not the original
# epidemiological features — interpret with caution.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.757595 |
| 0 | cardiovasc_death_rate | 0.141108 |
| 5 | aged_65_older | 0.063089 |
| 2 | female_smokers | 0.020350 |
| 3 | male_smokers | 0.009161 |
| 6 | median_age | 0.007329 |
| 4 | life_expectancy | 0.001368 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; consider a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the country-health-index analysis.
country1 = 'Sweden'
country2 = 'United States'
# Extracting important features for XGBoost Model Analysis for the country health index
# Keep only the identifier columns, the five country-level socioeconomic
# features, and the target, then restrict to the two selected countries.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 23011 | Sweden | 2/1/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 10549349 | 0.000000 |
| 23012 | Sweden | 2/2/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 10549349 | 0.000000 |
| 23013 | Sweden | 2/3/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 10549349 | 0.000000 |
| 23014 | Sweden | 2/4/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 10549349 | 0.000000 |
| 23015 | Sweden | 2/5/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 10549349 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 338289856 | 1.084791 |
2136 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country mortality time series into supervised-learning
# features: lagged mortality at 1-day, 7-day, and 30-day horizons.
# Rows with no prior observation (the leading NaNs from shift) are set to 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' itself plus the three lag columns derived
# from it — the target leaks into the components later used as model inputs,
# which likely inflates the reported CV/R^2 scores. Both this fit and the
# matching transform below would need to exclude those columns to fix it.
# NOTE(review): PCA is fit on unscaled data here (StandardScaler is applied
# only after the projection), so high-variance columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL
# PCA input columns), not the original variables — naming them after the raw
# features is misleading, and the downstream feature-importance table inherits
# that mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X is built from
# principal_df — so this step only changes df_updated's column layout.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a daily time series mixes future and past
# observations between train and test; a chronological split would give a
# fairer evaluation — TODO confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardization statistics (mean/std) from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Project both splits with the training-set scaler (no test-set statistics used).
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))
# Exhaustive hyperparameter search for an XGBoost regressor over tree depth,
# learning rate, ensemble size, gamma, and row/column subsampling, scored with
# the estimator's default R^2 under 10-fold cross-validation.
xgb_model = xgb.XGBRegressor()
search_space = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=search_space, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
# Report the winning hyperparameter combination and its mean CV R^2 score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9964446927359324
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits the best estimator on the full
# training set (refit=True by default), so this explicit fit() re-trains an
# already-trained model — harmless but redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the Kullback-Leibler
# divergence between the two arrays after normalizing each to sum to 1 — it is
# not the entropy of the prediction errors, and it returns inf if any y_pred
# entry is 0 where the matching y_test entry is positive. Confirm this is the
# intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.05830853179428446 R2 Score: 0.9888085762966661 RMSE: 0.241472 Entropy Value: 0.002494604791450158
# Rank the tuned model's inputs by the booster's default importance metric.
# NOTE(review): X holds principal components that were merely renamed after the
# raw variables, so these "feature" importances describe PCs, not the original
# socioeconomic features — interpret with caution.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.556230 |
| 2 | extreme_poverty | 0.233704 |
| 0 | hospital_beds_per_thousand | 0.101283 |
| 3 | gdp_per_capita | 0.097960 |
| 4 | population | 0.010821 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; consider a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the population-health-index analysis.
country1 = 'Austria'
country2 = 'Cyprus'
# Extracting important features for XGBoost Model Analysis for the population health index
# Keep only the identifier columns, the seven population-health features, and
# the target, then restrict to the two selected countries.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4148 | Cyprus | 12/25/2022 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.199679 |
| 4149 | Cyprus | 12/26/2022 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.199679 |
| 4150 | Cyprus | 12/27/2022 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.199679 |
| 4151 | Cyprus | 12/28/2022 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.199679 |
| 4152 | Cyprus | 12/29/2022 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.199679 |
2066 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country mortality time series into supervised-learning
# features: lagged mortality at 1-day, 7-day, and 30-day horizons.
# Rows with no prior observation (the leading NaNs from shift) are set to 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' itself plus the three lag columns derived
# from it — the target leaks into the components later used as model inputs,
# which likely inflates the reported CV/R^2 scores. Both this fit and the
# matching transform below would need to exclude those columns to fix it.
# NOTE(review): PCA is fit on unscaled data here (StandardScaler is applied
# only after the projection), so high-variance columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL
# PCA input columns), not the original variables — naming them after the raw
# features is misleading, and the downstream feature-importance table inherits
# that mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X is built from
# principal_df — so this step only changes df_updated's column layout.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a daily time series mixes future and past
# observations between train and test; a chronological split would give a
# fairer evaluation — TODO confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardization statistics (mean/std) from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Project both splits with the training-set scaler (no test-set statistics used).
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))
# Exhaustive hyperparameter search for an XGBoost regressor over tree depth,
# learning rate, ensemble size, gamma, and row/column subsampling, scored with
# the estimator's default R^2 under 10-fold cross-validation.
xgb_model = xgb.XGBRegressor()
search_space = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=search_space, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
# Report the winning hyperparameter combination and its mean CV R^2 score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9977294625245022
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits the best estimator on the full
# training set (refit=True by default), so this explicit fit() re-trains an
# already-trained model — harmless but redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the Kullback-Leibler
# divergence between the two arrays after normalizing each to sum to 1 — it is
# not the entropy of the prediction errors, and it returns inf if any y_pred
# entry is 0 where the matching y_test entry is positive. Confirm this is the
# intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0011518898943072675 R2 Score: 0.9989790639681787 RMSE: 0.033940 Entropy Value: 0.0004452189828876654
# Rank the tuned model's inputs by the booster's default importance metric.
# NOTE(review): X holds principal components that were merely renamed after the
# raw variables, so these "feature" importances describe PCs, not the original
# epidemiological features — interpret with caution.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.681072 |
| 0 | cardiovasc_death_rate | 0.265483 |
| 2 | female_smokers | 0.021585 |
| 6 | median_age | 0.015140 |
| 5 | aged_65_older | 0.014290 |
| 3 | male_smokers | 0.001557 |
| 4 | life_expectancy | 0.000873 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; consider a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the country-health-index analysis.
country1 = 'Austria'
country2 = 'Cyprus'
# Extracting important features for XGBoost Model Analysis for the country health index
# Keep only the identifier columns, the five country-level socioeconomic
# features, and the target, then restrict to the two selected countries.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4148 | Cyprus | 12/25/2022 | 3.40 | 0.887 | 0.15 | 32415.132 | 896007 | 0.199679 |
| 4149 | Cyprus | 12/26/2022 | 3.40 | 0.887 | 0.15 | 32415.132 | 896007 | 0.199679 |
| 4150 | Cyprus | 12/27/2022 | 3.40 | 0.887 | 0.15 | 32415.132 | 896007 | 0.199679 |
| 4151 | Cyprus | 12/28/2022 | 3.40 | 0.887 | 0.15 | 32415.132 | 896007 | 0.199679 |
| 4152 | Cyprus | 12/29/2022 | 3.40 | 0.887 | 0.15 | 32415.132 | 896007 | 0.199679 |
2066 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country mortality time series into supervised-learning
# features: lagged mortality at 1-day, 7-day, and 30-day horizons.
# Rows with no prior observation (the leading NaNs from shift) are set to 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' itself plus the three lag columns derived
# from it — the target leaks into the components later used as model inputs,
# which likely inflates the reported CV/R^2 scores. Both this fit and the
# matching transform below would need to exclude those columns to fix it.
# NOTE(review): PCA is fit on unscaled data here (StandardScaler is applied
# only after the projection), so high-variance columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL
# PCA input columns), not the original variables — naming them after the raw
# features is misleading, and the downstream feature-importance table inherits
# that mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X is built from
# principal_df — so this step only changes df_updated's column layout.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a daily time series mixes future and past
# observations between train and test; a chronological split would give a
# fairer evaluation — TODO confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardization statistics (mean/std) from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Project both splits with the training-set scaler (no test-set statistics used).
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))
# Exhaustive hyperparameter search for an XGBoost regressor over tree depth,
# learning rate, ensemble size, gamma, and row/column subsampling, scored with
# the estimator's default R^2 under 10-fold cross-validation.
xgb_model = xgb.XGBRegressor()
search_space = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=search_space, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
# Report the winning hyperparameter combination and its mean CV R^2 score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9936138289842674
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits the best estimator on the full
# training set (refit=True by default), so this explicit fit() re-trains an
# already-trained model — harmless but redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the Kullback-Leibler
# divergence between the two arrays after normalizing each to sum to 1 — it is
# not the entropy of the prediction errors, and it returns inf if any y_pred
# entry is 0 where the matching y_test entry is positive. Confirm this is the
# intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.003065469000387181 R2 Score: 0.9972830321957044 RMSE: 0.055367 Entropy Value: 0.0009279980175255162
# Rank the tuned model's inputs by the booster's default importance metric.
# NOTE(review): X holds principal components that were merely renamed after the
# raw variables, so these "feature" importances describe PCs, not the original
# socioeconomic features — interpret with caution.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.602723 |
| 0 | hospital_beds_per_thousand | 0.253940 |
| 4 | population | 0.063896 |
| 2 | extreme_poverty | 0.044890 |
| 3 | gdp_per_capita | 0.034551 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; consider a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the pipeline.
country1 = 'Czechia'
country2 = 'Denmark'
# Extracting important features for XGBoost Model Analysis for the population health index
# Keep only the population-health predictors plus the target ('Mortality Rate').
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict to the two selected countries.
# NOTE(review): this chained selection yields a slice; the in-place column
# assignments made on df_updated later may raise SettingWithCopyWarning —
# consider appending .copy() here.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6244 | Denmark | 12/25/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.227772 |
| 6245 | Denmark | 12/26/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.227772 |
| 6246 | Denmark | 12/27/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.228905 |
| 6247 | Denmark | 12/28/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.229131 |
| 6248 | Denmark | 12/29/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.229131 |
2096 rows × 10 columns
'''
To make this Our World in Data COVID-19 time series usable by XGBoost, I
recast it as a supervised-learning problem: every row keeps its country
features and gains lagged copies of the target taken from the previous day,
week, and month, built with pandas' shift(). In this tabular form — one
observation per row, one feature per column — the XGBoost model can be used
directly to assess which variables are the highest predictors of COVID-19
mortality (death rate) per country.
'''
# Add the lagged mortality-rate columns country by country; rows without
# enough history (the first day/week/month per country) get 0 instead of NaN.
for periods, lag_col in ((1, 'prev_day_mortality'),
                         (7, 'prev_week_mortality'),
                         (30, 'prev_month_mortality')):
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(periods)
                           .fillna(0))
# Fit PCA on every column from index 2 onward to address multi-collinearity
# among the predictors.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
# Project the data onto the first 7 principal components.
# NOTE(review): the PCA was fit on df_updated.iloc[:,2:], which includes
# 'Mortality Rate' and its lagged copies — the target leaks into the
# components; consider fitting PCA on the predictor columns only.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — reusing the original feature names makes the downstream
# feature-importance table misleading; PC1..PC7 labels would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X comes from
# principal_df and y from 'Mortality Rate' — so this step has no effect on
# the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a time series mixes future rows into
# the training set (and PCA above was already fit on all rows) — a
# chronological split such as TimeSeriesSplit would avoid look-ahead leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990768399961493
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV defaults to refit=True, so best_estimator_ is
# already fit on the full training set — this extra .fit() repeats work
# (harmless but redundant).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of the
# two arrays after normalizing them to distributions — it is not the Shannon
# entropy of the errors. Confirm this is the intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002233914255848466 R2 Score: 0.9981453469995119 RMSE: 0.047264 Entropy Value: 0.0004229517361525248
feature_importances = best_model.feature_importances_
# NOTE(review): these importances rank the principal components, which were
# labelled with the original feature names above — do not read this table as
# importances of the raw variables.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.817142 |
| 5 | aged_65_older | 0.058152 |
| 0 | cardiovasc_death_rate | 0.051193 |
| 6 | median_age | 0.048701 |
| 2 | female_smokers | 0.023894 |
| 3 | male_smokers | 0.000842 |
| 4 | life_expectancy | 0.000075 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded, user-specific absolute path — consider a relative
# path or a config value so the notebook is portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the pipeline.
country1 = 'Czechia'
country2 = 'Denmark'
# Extracting important features for XGBoost Model Analysis for the country health index
# Keep only the country-level predictors plus the target ('Mortality Rate').
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
# Restrict to the two selected countries.
# NOTE(review): this chained selection yields a slice; the in-place column
# assignments made on df_updated later may raise SettingWithCopyWarning —
# consider appending .copy() here.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 10493990 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 10493990 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 10493990 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 10493990 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 10493990 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6244 | Denmark | 12/25/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 5882259 | 0.227772 |
| 6245 | Denmark | 12/26/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 5882259 | 0.227772 |
| 6246 | Denmark | 12/27/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 5882259 | 0.228905 |
| 6247 | Denmark | 12/28/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 5882259 | 0.229131 |
| 6248 | Denmark | 12/29/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 5882259 | 0.229131 |
2096 rows × 8 columns
'''
To make this Our World in Data COVID-19 time series usable by XGBoost, I
recast it as a supervised-learning problem: every row keeps its country
features and gains lagged copies of the target taken from the previous day,
week, and month, built with pandas' shift(). In this tabular form — one
observation per row, one feature per column — the XGBoost model can be used
directly to assess which variables are the highest predictors of COVID-19
mortality (death rate) per country.
'''
# Add the lagged mortality-rate columns country by country; rows without
# enough history (the first day/week/month per country) get 0 instead of NaN.
for periods, lag_col in ((1, 'prev_day_mortality'),
                         (7, 'prev_week_mortality'),
                         (30, 'prev_month_mortality')):
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(periods)
                           .fillna(0))
# Fit PCA on every column from index 2 onward to address multi-collinearity
# among the predictors.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
# Project the data onto the first 5 principal components.
# NOTE(review): the PCA was fit on df_updated.iloc[:,2:], which includes
# 'Mortality Rate' and its lagged copies — the target leaks into the
# components; consider fitting PCA on the predictor columns only.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — reusing the original feature names makes the downstream
# feature-importance table misleading; PC1..PC5 labels would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X comes from
# principal_df and y from 'Mortality Rate' — so this step has no effect on
# the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a time series mixes future rows into
# the training set (and PCA above was already fit on all rows) — a
# chronological split such as TimeSeriesSplit would avoid look-ahead leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9984852360468537
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV defaults to refit=True, so best_estimator_ is
# already fit on the full training set — this extra .fit() repeats work
# (harmless but redundant).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of the
# two arrays after normalizing them to distributions — it is not the Shannon
# entropy of the errors. Confirm this is the intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002011261480005173 R2 Score: 0.9983301990535706 RMSE: 0.044847 Entropy Value: 0.0005490250491042806
feature_importances = best_model.feature_importances_
# NOTE(review): these importances rank the principal components, which were
# labelled with the original feature names above — do not read this table as
# importances of the raw variables.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.848978 |
| 0 | hospital_beds_per_thousand | 0.066612 |
| 3 | gdp_per_capita | 0.039738 |
| 2 | extreme_poverty | 0.037550 |
| 4 | population | 0.007122 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded, user-specific absolute path — consider a relative
# path or a config value so the notebook is portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the pipeline.
country1 = 'France'
country2 = 'Portugal'
# Extracting important features for XGBoost Model Analysis for the population health index
# Keep only the population-health predictors plus the target ('Mortality Rate').
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict to the two selected countries.
# NOTE(review): this chained selection yields a slice; the in-place column
# assignments made on df_updated later may raise SettingWithCopyWarning —
# consider appending .copy() here.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8377 | France | 1/25/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8378 | France | 1/26/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8379 | France | 1/27/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8380 | France | 1/28/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11513 | Portugal | 12/25/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11514 | Portugal | 12/26/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11515 | Portugal | 12/27/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11516 | Portugal | 12/28/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11517 | Portugal | 12/29/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
2105 rows × 10 columns
'''
To make this Our World in Data COVID-19 time series usable by XGBoost, I
recast it as a supervised-learning problem: every row keeps its country
features and gains lagged copies of the target taken from the previous day,
week, and month, built with pandas' shift(). In this tabular form — one
observation per row, one feature per column — the XGBoost model can be used
directly to assess which variables are the highest predictors of COVID-19
mortality (death rate) per country.
'''
# Add the lagged mortality-rate columns country by country; rows without
# enough history (the first day/week/month per country) get 0 instead of NaN.
for periods, lag_col in ((1, 'prev_day_mortality'),
                         (7, 'prev_week_mortality'),
                         (30, 'prev_month_mortality')):
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(periods)
                           .fillna(0))
# Fit PCA on every column from index 2 onward to address multi-collinearity
# among the predictors.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
# Project the data onto the first 7 principal components.
# NOTE(review): the PCA was fit on df_updated.iloc[:,2:], which includes
# 'Mortality Rate' and its lagged copies — the target leaks into the
# components; consider fitting PCA on the predictor columns only.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — reusing the original feature names makes the downstream
# feature-importance table misleading; PC1..PC7 labels would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X comes from
# principal_df and y from 'Mortality Rate' — so this step has no effect on
# the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a time series mixes future rows into
# the training set (and PCA above was already fit on all rows) — a
# chronological split such as TimeSeriesSplit would avoid look-ahead leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9957831690613299
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV defaults to refit=True, so best_estimator_ is
# already fit on the full training set — this extra .fit() repeats work
# (harmless but redundant).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of the
# two arrays after normalizing them to distributions — it is not the Shannon
# entropy of the errors. Confirm this is the intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.047477141309076605 R2 Score: 0.9958871286533991 RMSE: 0.217892 Entropy Value: 0.0012934258184287219
feature_importances = best_model.feature_importances_
# NOTE(review): these importances rank the principal components, which were
# labelled with the original feature names above — do not read this table as
# importances of the raw variables.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.565073 |
| 0 | cardiovasc_death_rate | 0.373921 |
| 6 | median_age | 0.035919 |
| 2 | female_smokers | 0.010778 |
| 3 | male_smokers | 0.005120 |
| 4 | life_expectancy | 0.004886 |
| 5 | aged_65_older | 0.004303 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded, user-specific absolute path — consider a relative
# path or a config value so the notebook is portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the pipeline.
country1 = 'France'
country2 = 'Portugal'
# Extracting important features for XGBoost Model Analysis for the country health index
# Keep only the country-level predictors plus the target ('Mortality Rate').
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
# Restrict to the two selected countries.
# NOTE(review): this chained selection yields a slice; the in-place column
# assignments made on df_updated later may raise SettingWithCopyWarning —
# consider appending .copy() here.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 67813000 | 0.000000 |
| 8377 | France | 1/25/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 67813000 | 0.000000 |
| 8378 | France | 1/26/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 67813000 | 0.000000 |
| 8379 | France | 1/27/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 67813000 | 0.000000 |
| 8380 | France | 1/28/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 67813000 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11513 | Portugal | 12/25/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 10270857 | 0.462977 |
| 11514 | Portugal | 12/26/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 10270857 | 0.462977 |
| 11515 | Portugal | 12/27/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 10270857 | 0.462977 |
| 11516 | Portugal | 12/28/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 10270857 | 0.462977 |
| 11517 | Portugal | 12/29/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 10270857 | 0.462977 |
2105 rows × 8 columns
'''
To make this Our World in Data COVID-19 time series usable by XGBoost, I
recast it as a supervised-learning problem: every row keeps its country
features and gains lagged copies of the target taken from the previous day,
week, and month, built with pandas' shift(). In this tabular form — one
observation per row, one feature per column — the XGBoost model can be used
directly to assess which variables are the highest predictors of COVID-19
mortality (death rate) per country.
'''
# Add the lagged mortality-rate columns country by country; rows without
# enough history (the first day/week/month per country) get 0 instead of NaN.
for periods, lag_col in ((1, 'prev_day_mortality'),
                         (7, 'prev_week_mortality'),
                         (30, 'prev_month_mortality')):
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(periods)
                           .fillna(0))
# Fit PCA on every column from index 2 onward to address multi-collinearity
# among the predictors.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first five principal components — one per input variable used in the
# XGBoost analysis of the country health index.
n_components = 5
pc_cols = ['hospital_beds_per_thousand', 'human_development_index',
           'extreme_poverty', 'gdp_per_capita', 'population']
# NOTE(review): these labels are the original feature names, but each principal
# component is a linear mix of ALL inputs — the names are positional only, so
# downstream "feature importance" refers to PCs, not the raw variables.
principal_df = pd.DataFrame(
    pca.transform(df_updated.iloc[:, 2:])[:, :n_components],
    columns=pc_cols,
)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label.
# NOTE(review): the resulting dummy columns are never used below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_cols
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardization statistics from the training set only.
scaler = StandardScaler().fit(X_train)
[cell output] StandardScaler()
(In a Jupyter environment, rerun this cell to show the HTML representation, or trust the notebook.)
# Standardize both splits with the scaler fitted on the training data.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Hyperparameter grid: tree depth, shrinkage, ensemble size, regularization
# (gamma), and row/column sampling rates.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Baseline regressor; the grid search below picks its hyperparameters.
xgb_model = xgb.XGBRegressor()
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9931385478865573
# Take the cross-validated winner and refit it on the training data.
# NOTE(review): GridSearchCV(refit=True, the default) has already refitted
# best_estimator_ on the full training set — this explicit fit is redundant
# but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Score the held-out test set.
y_pred = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability
# distributions and returns their KL divergence — this is not a conventional
# regression metric; confirm it is what the analysis intends.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0845928198808711 R2 Score: 0.9926718548037413 RMSE: 0.290848 Entropy Value: 0.004828902175062184
# Rank inputs by the trained model's gain-based importance.
# NOTE(review): the importances attach to principal components, not to the raw
# variables whose names label them (see the PCA step above).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.615555 |
| 4 | population | 0.159129 |
| 0 | hospital_beds_per_thousand | 0.114584 |
| 2 | extreme_poverty | 0.090604 |
| 3 | gdp_per_capita | 0.020128 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute local path — consider making this configurable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (notebook cell output follows).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the analysis.
country1 = 'Slovakia'
country2 = 'Slovenia'
# Restrict to the population-health predictors plus identifiers and target,
# and keep only the two countries of interest (rows and columns in one .loc).
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
             'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
             'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.536669 |
2091 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (computed per country so one country's history never bleeds into another's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no history yet)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the lag columns, so the
# target feeds the PCA, and PCA is fitted before the train/test split (leakage) — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these labels are original feature names, but each principal component is a
# linear mix of ALL inputs — the names are positional only.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting dummy columns are never used below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# (70/30 split, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# (tree depth, shrinkage, ensemble size, regularization gamma, row/column sampling)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9984789767307959
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV(refit=True, the default) already refits best_estimator_ —
# this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability distributions
# and returns their KL divergence — not a conventional regression metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0027083668308768272 R2 Score: 0.9986712195046868 RMSE: 0.052042 Entropy Value: 0.00047135693736216383
# Rank inputs by the trained model's importance.
# NOTE(review): the importances attach to principal components, not to the raw
# variables whose names label them (see the PCA step above).
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 6 | median_age | 0.593303 |
| 1 | diabetes_prevalence | 0.295471 |
| 0 | cardiovasc_death_rate | 0.053646 |
| 5 | aged_65_older | 0.050607 |
| 4 | life_expectancy | 0.003674 |
| 2 | female_smokers | 0.002363 |
| 3 | male_smokers | 0.000937 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute local path — consider making this configurable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (notebook cell output follows).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the analysis.
country1 = 'Slovakia'
country2 = 'Slovenia'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
# Keep only the two countries of interest.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 2119843 | 0.536669 |
2091 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (computed per country so one country's history never bleeds into another's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no history yet)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the lag columns, so the
# target feeds the PCA, and PCA is fitted before the train/test split (leakage) — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these labels are original feature names, but each principal component is a
# linear mix of ALL inputs — the names are positional only.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting dummy columns are never used below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# (70/30 split, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# (tree depth, shrinkage, ensemble size, regularization gamma, row/column sampling)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9977419513278931
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV(refit=True, the default) already refits best_estimator_ —
# this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability distributions
# and returns their KL divergence — not a conventional regression metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00345193732286861 R2 Score: 0.9983064085214091 RMSE: 0.058753 Entropy Value: 0.0008641336305743049
# Rank inputs by the trained model's importance.
# NOTE(review): the importances attach to principal components, not to the raw
# variables whose names label them (see the PCA step above).
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.766836 |
| 0 | hospital_beds_per_thousand | 0.123667 |
| 2 | extreme_poverty | 0.091805 |
| 3 | gdp_per_capita | 0.015496 |
| 4 | population | 0.002195 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute local path — consider making this configurable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (notebook cell output follows).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the analysis.
country1 = 'Belgium'
country2 = 'Italy'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two countries of interest.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
2124 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (computed per country so one country's history never bleeds into another's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no history yet)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the lag columns, so the
# target feeds the PCA, and PCA is fitted before the train/test split (leakage) — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these labels are original feature names, but each principal component is a
# linear mix of ALL inputs — the names are positional only.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting dummy columns are never used below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# (70/30 split, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# (tree depth, shrinkage, ensemble size, regularization gamma, row/column sampling)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9992146796046744
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV(refit=True, the default) already refits best_estimator_ —
# this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability distributions
# and returns their KL divergence — not a conventional regression metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.017406911489324793 R2 Score: 0.9991022590867014 RMSE: 0.131935 Entropy Value: 0.00043772807452064234
# Rank inputs by the trained model's importance.
# NOTE(review): the importances attach to principal components, not to the raw
# variables whose names label them (see the PCA step above).
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.549617 |
| 5 | aged_65_older | 0.265471 |
| 1 | diabetes_prevalence | 0.155939 |
| 2 | female_smokers | 0.025952 |
| 6 | median_age | 0.001582 |
| 3 | male_smokers | 0.001153 |
| 4 | life_expectancy | 0.000287 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute local path — consider making this configurable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (notebook cell output follows).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Belgium'
country2 = 'Italy'
# Restrict to the country-health-index predictors (plus identifiers and the
# target) for the two countries being compared.
selected_features = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), selected_features]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 11655923 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 11655923 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 11655923 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 11655923 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 11655923 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 59037472 | 0.735109 |
2124 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the target within each country (1 day, 7 days, 30 days back) so the time
# series becomes a supervised-learning table; grouping by location keeps one
# country's history from bleeding into another's.
grouped_mortality = df_updated.groupby(['location'])['Mortality Rate']
for lag_column, lag_days in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    # shift() yields NaN for the first `lag_days` rows of each country; use 0 there.
    df_updated[lag_column] = grouped_mortality.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lag
# columns in the PCA input, so the components are partly built from the target
# itself — likely data leakage that inflates the downstream scores; confirm intent.
# NOTE(review): PCA is fitted on unscaled data here (StandardScaler is applied
# only after the split below), so high-variance columns such as population
# dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 5 principal components — one per original input variable of
# the country-health-index XGBoost analysis.
n_components = 5
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# FIX: each principal component is a linear combination of *all* the PCA input
# columns, so labelling the components with the original feature names (as the
# previous version did) made the downstream feature-importance table read as
# raw-feature importances when it is really component importances.
# Label them PC1..PCn instead; selected_cols carries the labels forward.
selected_cols = ['PC' + str(i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label in df_updated (the model input X below is
# built from the principal components only).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
# NOTE(review): the PCA above was fitted on the full dataset before this
# split, so the components carry test-set information (leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only; both splits are transformed below.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the scaler fitted on the training data above.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters come from the grid search below.
xgb_model = xgb.XGBRegressor()
# Candidate values for each tuned hyperparameter.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive search over the grid with 10-fold cross-validation,
# parallelised across all available CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9985734833226066
# Re-fit the tuned estimator on the training split (GridSearchCV's refit=True
# default has already done this once; the repeat is harmless).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Score the tuned model on the held-out test split.
y_pred = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the
# two series treated as probability distributions — an unusual regression
# metric; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
rmse = np.sqrt(mse)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.09706029058839453 R2 Score: 0.9949942300808909 RMSE: 0.311545 Entropy Value: 0.002419033420494509
# Tabulate the tuned model's importance score for each input column, largest first.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = feature_importances.sort_values(by='importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.718673 |
| 3 | gdp_per_capita | 0.148969 |
| 2 | extreme_poverty | 0.112091 |
| 0 | hospital_beds_per_thousand | 0.011429 |
| 4 | population | 0.008838 |
# Importing the dataframe of all 26 countries
# (re-loads the full panel so this section starts from unfiltered data after
# the previous section mutated df_updated in place).
# NOTE(review): absolute local path — breaks on any other machine; consider a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Luxembourg'
country2 = 'Netherlands'
# Restrict to the population-health-index predictors (plus identifiers and the
# target) for the two countries being compared.
selected_features = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), selected_features]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
2078 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the target within each country (1 day, 7 days, 30 days back) so the time
# series becomes a supervised-learning table; grouping by location keeps one
# country's history from bleeding into another's.
grouped_mortality = df_updated.groupby(['location'])['Mortality Rate']
for lag_column, lag_days in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    # shift() yields NaN for the first `lag_days` rows of each country; use 0 there.
    df_updated[lag_column] = grouped_mortality.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lag
# columns in the PCA input, so the components are partly built from the target
# itself — likely data leakage that inflates the downstream scores; confirm intent.
# NOTE(review): PCA is fitted on unscaled data here (StandardScaler is applied
# only after the split below), so high-variance columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 7 principal components — one per original input variable of
# the population-health-index XGBoost analysis.
n_components = 7
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# FIX: each principal component is a linear combination of *all* the PCA input
# columns, so labelling the components with the original feature names (as the
# previous version did) made the downstream feature-importance table read as
# raw-feature importances when it is really component importances.
# Label them PC1..PCn instead; selected_cols carries the labels forward.
selected_cols = ['PC' + str(i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label in df_updated (the model input X below is
# built from the principal components only).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
# NOTE(review): the PCA above was fitted on the full dataset before this
# split, so the components carry test-set information (leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only; both splits are transformed below.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the scaler fitted on the training data above.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters come from the grid search below.
xgb_model = xgb.XGBRegressor()
# Candidate values for each tuned hyperparameter.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive search over the grid with 10-fold cross-validation,
# parallelised across all available CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9991149035005054
# Re-fit the tuned estimator on the training split (GridSearchCV's refit=True
# default has already done this once; the repeat is harmless).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Score the tuned model on the held-out test split.
y_pred = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the
# two series treated as probability distributions — an unusual regression
# metric; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
rmse = np.sqrt(mse)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0032604762329485467 R2 Score: 0.9995750530257127 RMSE: 0.057101 Entropy Value: 0.0004307174874771804
# Tabulate the tuned model's importance score for each input column, largest first.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = feature_importances.sort_values(by='importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.562829 |
| 0 | cardiovasc_death_rate | 0.297571 |
| 5 | aged_65_older | 0.104378 |
| 2 | female_smokers | 0.016723 |
| 6 | median_age | 0.015953 |
| 3 | male_smokers | 0.002131 |
| 4 | life_expectancy | 0.000416 |
# Importing the dataframe of all 26 countries
# (re-loads the full panel so this section starts from unfiltered data after
# the previous section mutated df_updated in place).
# NOTE(review): absolute local path — breaks on any other machine; consider a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Luxembourg'
country2 = 'Netherlands'
# Restrict to the country-health-index predictors (plus identifiers and the
# target) for the two countries being compared.
selected_features = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), selected_features]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 647601 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 647601 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 647601 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 647601 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 647601 | 0.377872 |
2078 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the target within each country (1 day, 7 days, 30 days back) so the time
# series becomes a supervised-learning table; grouping by location keeps one
# country's history from bleeding into another's.
grouped_mortality = df_updated.groupby(['location'])['Mortality Rate']
for lag_column, lag_days in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    # shift() yields NaN for the first `lag_days` rows of each country; use 0 there.
    df_updated[lag_column] = grouped_mortality.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lag
# columns in the PCA input, so the components are partly built from the target
# itself — likely data leakage that inflates the downstream scores; confirm intent.
# NOTE(review): PCA is fitted on unscaled data here (StandardScaler is applied
# only after the split below), so high-variance columns such as population
# dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 5 principal components — one per original input variable of
# the country-health-index XGBoost analysis.
n_components = 5
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# FIX: each principal component is a linear combination of *all* the PCA input
# columns, so labelling the components with the original feature names (as the
# previous version did) made the downstream feature-importance table read as
# raw-feature importances when it is really component importances.
# Label them PC1..PCn instead; selected_cols carries the labels forward.
selected_cols = ['PC' + str(i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label in df_updated (the model input X below is
# built from the principal components only).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
# NOTE(review): the PCA above was fitted on the full dataset before this
# split, so the components carry test-set information (leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only; both splits are transformed below.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the scaler fitted on the training data above.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters come from the grid search below.
xgb_model = xgb.XGBRegressor()
# Candidate values for each tuned hyperparameter.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive search over the grid with 10-fold cross-validation,
# parallelised across all available CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9977757658046478
# Re-fit the tuned estimator on the training split (GridSearchCV's refit=True
# default has already done this once; the repeat is harmless).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Score the tuned model on the held-out test split.
y_pred = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the
# two series treated as probability distributions — an unusual regression
# metric; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
rmse = np.sqrt(mse)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.009316482430350636 R2 Score: 0.998785756822954 RMSE: 0.096522 Entropy Value: 0.001636704915774484
# Tabulate the tuned model's importance score for each input column, largest first.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = feature_importances.sort_values(by='importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.846066 |
| 2 | extreme_poverty | 0.097140 |
| 0 | hospital_beds_per_thousand | 0.048313 |
| 3 | gdp_per_capita | 0.007219 |
| 4 | population | 0.001262 |
# Importing the dataframe of all 26 countries
# (re-loads the full panel so this section starts from unfiltered data after
# the previous section mutated df_updated in place).
# NOTE(review): absolute local path — breaks on any other machine; consider a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Switzerland'
country2 = 'United Kingdom'
# Restrict to the population-health-index predictors (plus identifiers and the
# target) for the two countries being compared.
selected_features = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), selected_features]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.322149 |
2102 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the target within each country (1 day, 7 days, 30 days back) so the time
# series becomes a supervised-learning table; grouping by location keeps one
# country's history from bleeding into another's.
grouped_mortality = df_updated.groupby(['location'])['Mortality Rate']
for lag_column, lag_days in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    # shift() yields NaN for the first `lag_days` rows of each country; use 0 there.
    df_updated[lag_column] = grouped_mortality.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lag
# columns in the PCA input, so the components are partly built from the target
# itself — likely data leakage that inflates the downstream scores; confirm intent.
# NOTE(review): PCA is fitted on unscaled data here (StandardScaler is applied
# only after the split below), so high-variance columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 7 principal components — one per original input variable of
# the population-health-index XGBoost analysis.
n_components = 7
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# FIX: each principal component is a linear combination of *all* the PCA input
# columns, so labelling the components with the original feature names (as the
# previous version did) made the downstream feature-importance table read as
# raw-feature importances when it is really component importances.
# Label them PC1..PCn instead; selected_cols carries the labels forward.
selected_cols = ['PC' + str(i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label in df_updated (the model input X below is
# built from the principal components only).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
# NOTE(review): the PCA above was fitted on the full dataset before this
# split, so the components carry test-set information (leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only; both splits are transformed below.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the train-fitted standardization to the training set
X_train_scaled = scaler.transform(X_train)
# ... and to the test set (same parameters, no refitting)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor with default settings; hyperparameters are tuned below
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid: 3*3*3*3*2*2 = 324 combinations
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Exhaustive grid search with 10-fold cross-validation (default scoring for
# a regressor is R^2); n_jobs=-1 uses all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning combination and its mean cross-validated R^2
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9534896177522434
# Retrieve the cross-validation winner. GridSearchCV (refit=True by default)
# has already retrained best_estimator_ on the full training set, so the
# second explicit fit() the original code ran here was redundant and has
# been removed -- predictions are unchanged.
best_model = grid_search.best_estimator_
# Predict on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the model: MSE, RMSE, R^2 score, and entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its two arguments into
# probability distributions and returns their KL divergence -- its value on
# raw regression targets/predictions is hard to interpret; confirm this
# metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 3.3003716461415755 R2 Score: 0.8696787142284871 RMSE: 1.816693 Entropy Value: 0.010589221412772479
# Feature importances from the tuned model.
# NOTE(review): these importances belong to the principal components, which
# were merely labelled with raw-feature names when principal_df was built --
# they do not directly rank the original variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.453094 |
| 1 | diabetes_prevalence | 0.312006 |
| 2 | female_smokers | 0.152936 |
| 6 | median_age | 0.034726 |
| 5 | aged_65_older | 0.026498 |
| 4 | life_expectancy | 0.015497 |
| 3 | male_smokers | 0.005243 |
# Importing the dataframe of all 26 countries (written by the pairing step)
# NOTE(review): absolute local path -- not portable across machines
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared by the country-health-index model
country1 = 'Switzerland'
country2 = 'United Kingdom'
# Restrict the frame to the country-health feature columns and to the rows
# belonging to the two paired countries.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, feature_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 67508936 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 67508936 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 67508936 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 67508936 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 67508936 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 8740471 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 8740471 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 8740471 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 8740471 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 8740471 | 0.322149 |
2102 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add lagged copies of the mortality rate (previous day / week / month)
# within each country so the series can be modelled as tabular data.
lag_periods = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for column_name, periods in lag_periods.items():
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(periods)
    # Rows before a country's first observation have no lag value; use 0.
    df_updated[column_name] = shifted.fillna(0)
# PCA over every column from the third onward, to mitigate multi-collinearity.
# NOTE(review): the slice includes 'Mortality Rate' and the lag columns --
# verify the target is meant to feed the PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 5 principal components (5 = number of country-health input
# variables used by the XGBoost model below)
n_components = 5 # of input variables for XGBoost Model Analysis
# Project the same columns that were used to fit the PCA, truncated to 5 PCs
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): these columns hold principal components, not the original
# features -- labelling them with the raw variable names is misleading and
# makes the later feature-importance table read as if it ranked the raw
# features; confirm this interpretation is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'; only the 'Mortality Rate' column of this frame
# is used below (as the target y)
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
# NOTE(review): a random split of a time series mixes past and future rows
# between train and test -- confirm a chronological split is not required.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardizer on the training set only (avoids test-set leakage)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the train-fitted standardization to the training set
X_train_scaled = scaler.transform(X_train)
# ... and to the test set (same parameters, no refitting)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor with default settings; hyperparameters are tuned below
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid: 3*3*3*3*2*2 = 324 combinations
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Exhaustive grid search with 10-fold cross-validation (default scoring for
# a regressor is R^2); n_jobs=-1 uses all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning combination and its mean cross-validated R^2
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9514721454520239
# Retrieve the CV winner.
# NOTE(review): GridSearchCV refits the best estimator on the full training
# set by default (refit=True), so the explicit fit() below retrains on the
# same data and is redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Test-set metrics: MSE, RMSE, R^2 score, and entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its two arguments into
# probability distributions and returns their KL divergence -- of doubtful
# meaning for raw regression values; confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 1.512484015018286 R2 Score: 0.9402767679886957 RMSE: 1.229831 Entropy Value: 0.0070634068971487385
# Feature importances from the tuned model.
# NOTE(review): these importances belong to the principal components, which
# were merely labelled with raw-feature names when principal_df was built --
# they do not directly rank the original variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.470955 |
| 2 | extreme_poverty | 0.184488 |
| 4 | population | 0.141774 |
| 0 | hospital_beds_per_thousand | 0.118008 |
| 3 | gdp_per_capita | 0.084775 |
# Country Pair by Pair Analysis relative to hospital beds per thousand
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): absolute local path -- not portable across machines
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Showing the pairings of countries based on hospital beds per thousand (13 pairs of countries)
# One sub-dataframe per country, filtered on the 'location' column.
df_Austria = df[(df.location == "Austria")]
df_Bulgaria = df[(df.location == "Bulgaria")]
df_Czechia = df[(df.location == "Czechia")]
df_France = df[(df.location == "France")]
df_Romania = df[(df.location == "Romania")]
df_Slovakia = df[(df.location == "Slovakia")]
df_Belgium = df[(df.location == "Belgium")]
df_Estonia = df[(df.location == "Estonia")]
df_Latvia = df[(df.location == "Latvia")]
df_Luxembourg = df[(df.location == "Luxembourg")]
df_Serbia = df[(df.location == "Serbia")]
df_Slovenia = df[(df.location == "Slovenia")]
df_Switzerland = df[(df.location == "Switzerland")]
df_Canada = df[(df.location == "Canada")]
df_Cyprus = df[(df.location == "Cyprus")]
df_Denmark = df[(df.location == "Denmark")]
df_Finland = df[(df.location == "Finland")]
df_Iceland = df[(df.location == "Iceland")]
df_Ireland = df[(df.location == "Ireland")]
df_Italy = df[(df.location == "Italy")]
df_Netherlands = df[(df.location == "Netherlands")]
df_Portugal = df[(df.location == "Portugal")]
df_Spain = df[(df.location == "Spain")]
df_Sweden = df[(df.location == "Sweden")]
df_UnitedKingdom = df[(df.location == "United Kingdom")]
df_UnitedStates = df[(df.location == "United States")]
# Drop the first two UK rows.
# NOTE(review): presumably this aligns the UK series' start date with the
# other countries' (its reporting appears to start earlier) -- confirm
# against the source data.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)
# Export the combined dataframe to CSV.
# NOTE(review): this writes to the current working directory, while the
# later read_csv loads from C:/Users/marco/Downloads -- these paths must
# resolve to the same file for the pipeline to stay consistent.
dataframe_one.to_csv("dataframe-one.csv")
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): the export above wrote "dataframe-one.csv" to the working
# directory; this absolute Downloads path must point at the same file
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared by the population-health-index model
country1 = 'Austria'
country2 = 'Bulgaria'
# Restrict the frame to the population-health feature columns and to the
# rows belonging to the two paired countries.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, feature_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3121 | Bulgaria | 12/25/2022 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 2.949845 |
| 3122 | Bulgaria | 12/26/2022 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 2.950107 |
| 3123 | Bulgaria | 12/27/2022 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 2.949883 |
| 3124 | Bulgaria | 12/28/2022 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 2.949716 |
| 3125 | Bulgaria | 12/29/2022 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 2.949605 |
2066 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the per-country mortality series into supervised-learning rows by
# attaching lagged mortality values (1 day, 1 week, 1 month back).
for name, n_back in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    # The first n_back rows of each country have no lag; treat them as 0.
    df_updated[name] = (
        df_updated.groupby(['location'])['Mortality Rate'].shift(n_back).fillna(0)
    )
# Fit PCA on all columns from the third onward to counter multi-collinearity.
# NOTE(review): this slice includes 'Mortality Rate' and its lags -- confirm
# feeding the target into the PCA is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 7 principal components (7 = number of population-health
# input variables used by the XGBoost model below)
n_components = 7 # of input variables for XGBoost Model Analysis
# Project the same columns that were used to fit the PCA, truncated to 7 PCs
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): these columns hold principal components, not the original
# features -- labelling them with the raw variable names is misleading and
# makes the later feature-importance table read as if it ranked the raw
# features; confirm this interpretation is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'; only the 'Mortality Rate' column of this frame
# is used below (as the target y)
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
# NOTE(review): a random split of a time series mixes past and future rows
# between train and test -- confirm a chronological split is not required.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardizer on the training set only (avoids test-set leakage)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the train-fitted standardization to the training set
X_train_scaled = scaler.transform(X_train)
# ... and to the test set (same parameters, no refitting)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor with default settings; hyperparameters are tuned below
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid: 3*3*3*3*2*2 = 324 combinations
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Exhaustive grid search with 10-fold cross-validation (default scoring for
# a regressor is R^2); n_jobs=-1 uses all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning combination and its mean cross-validated R^2
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'subsample': 0.8}
Best CV score: 0.9390150883353605
# Retrieve the CV winner.
# NOTE(review): GridSearchCV refits the best estimator on the full training
# set by default (refit=True), so the explicit fit() below retrains on the
# same data and is redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Test-set metrics: MSE, RMSE, R^2 score, and entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its two arguments into
# probability distributions and returns their KL divergence -- of doubtful
# meaning for raw regression values; confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0044848062013056154 R2 Score: 0.9975309010841117 RMSE: 0.066969 Entropy Value: 0.000715458116492232
# Feature importances from the tuned model.
# NOTE(review): these importances belong to the principal components, which
# were merely labelled with raw-feature names when principal_df was built --
# they do not directly rank the original variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | aged_65_older | 0.306596 |
| 0 | cardiovasc_death_rate | 0.216755 |
| 1 | diabetes_prevalence | 0.208843 |
| 6 | median_age | 0.148879 |
| 2 | female_smokers | 0.054899 |
| 3 | male_smokers | 0.042820 |
| 4 | life_expectancy | 0.021207 |
# Importing the dataframe of all 26 countries (written by the pairing step)
# NOTE(review): absolute local path -- not portable across machines
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared by the country-health-index model
country1 = 'Austria'
country2 = 'Bulgaria'
# Extracting important features for XGBoost Model Analysis for the country health index
# NOTE(review): unlike the earlier country-health pass (which included
# hospital_beds_per_thousand), this feature list uses population_density --
# confirm the differing feature sets across passes are intentional.
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3121 | Bulgaria | 12/25/2022 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 2.949845 |
| 3122 | Bulgaria | 12/26/2022 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 2.950107 |
| 3123 | Bulgaria | 12/27/2022 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 2.949883 |
| 3124 | Bulgaria | 12/28/2022 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 2.949716 |
| 3125 | Bulgaria | 12/29/2022 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 2.949605 |
2066 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Append lagged mortality columns (previous day / week / month) per country,
# converting the time series to a supervised-learning table.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_name, offset in lag_spec.items():
    lagged = df_updated.groupby(['location'])['Mortality Rate'].shift(offset)
    # Leading rows of each country's series have no lagged value; use 0.
    df_updated[lag_name] = lagged.fillna(0)
# Fit PCA on all columns from the third onward to mitigate multi-collinearity.
# NOTE(review): the slice includes 'Mortality Rate' and the lag columns --
# verify the target is meant to feed the PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a time series mixes future and past
# observations between train and test; a chronological split would avoid
# look-ahead bias.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# Standardization is fitted on the training partition only, then applied to
# both partitions — this part is leak-free.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations; with cv=10 this is 3240 fits.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes across all available cores; scoring defaults to the
# regressor's R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9274228684091804
# Retrieve the tuned model. GridSearchCV (refit=True, the default) has already
# refit best_estimator_ on the full training set, so an extra fit() call here
# would just repeat that training and is omitted.
best_model = grid_search.best_estimator_
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) computes the KL divergence and returns inf when
# qk is 0 where pk > 0 — zero mortality rates are common early in each series,
# which is exactly what produced "Entropy Value: inf" in another run of this
# pipeline. Restrict the computation to strictly positive pairs so the metric
# stays finite and comparable across runs.
positive = (y_test > 0) & (y_pred > 0)
entropy_val = entropy(y_test[positive], y_pred[positive])
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004503341096560387 R2 Score: 0.9975206967435615 RMSE: 0.067107 Entropy Value: 0.0004825642819307405
# Rank the model inputs (the PCA-derived columns listed in selected_cols) by
# their XGBoost importance scores, highest first.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | human_development_index | 0.710086 |
| 1 | extreme_poverty | 0.179048 |
| 4 | population | 0.041480 |
| 2 | gdp_per_capita | 0.040024 |
| 3 | population_density | 0.029363 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable across
# machines; consider a relative path or a configuration variable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Czechia'
country2 = 'France'
# Keep only the population-health features (plus identifiers and the target)
# and restrict the rows to the two countries being compared.
population_health_cols = [
    'location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
    'median_age', 'Mortality Rate',
]
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411710 |
| 9443 | France | 12/26/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411282 |
| 9444 | France | 12/27/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411730 |
| 9445 | France | 12/28/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411813 |
| 9446 | France | 12/29/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411892 |
2105 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per 'location' group so one country's first rows never
# inherit another country's mortality history.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the leading NaNs with 0 fabricates a "zero mortality"
# history at the start of each country's series — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# includes 'Mortality Rate' (the prediction target) and its lag columns —
# target leakage into the features; verify this is intended.
# NOTE(review): PCA is fitted on unscaled data and before the train/test
# split, so large-magnitude columns dominate and test data leaks in.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are misleading — each column is a principal
# component (a linear combination of ALL PCA inputs), not the original feature
# it is named after. Downstream "feature importances" inherit this mislabel.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummy columns are created here but never used as
# model inputs below (X comes from principal_df only).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a time series mixes future and past
# observations between train and test; a chronological split would avoid
# look-ahead bias.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# Standardization is fitted on the training partition only, then applied to
# both partitions — this part is leak-free.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations; with cv=10 this is 3240 fits.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes across all available cores; scoring defaults to the
# regressor's R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9958649994549903
# Retrieve the tuned model. GridSearchCV (refit=True, the default) has already
# refit best_estimator_ on the full training set, so an extra fit() call here
# would just repeat that training and is omitted.
best_model = grid_search.best_estimator_
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) computes the KL divergence and returns inf when
# qk is 0 where pk > 0; zero mortality rates are common early in each series.
# Restrict the computation to strictly positive pairs so the metric stays
# finite and comparable across runs.
positive = (y_test > 0) & (y_pred > 0)
entropy_val = entropy(y_test[positive], y_pred[positive])
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.09823454258048223 R2 Score: 0.9905286905291042 RMSE: 0.313424 Entropy Value: 0.001940918275872383
# Rank the model inputs (the PCA-derived columns listed in selected_cols) by
# their XGBoost importance scores, highest first.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.486330 |
| 5 | aged_65_older | 0.411606 |
| 1 | diabetes_prevalence | 0.081625 |
| 3 | male_smokers | 0.006524 |
| 2 | female_smokers | 0.006491 |
| 6 | median_age | 0.005829 |
| 4 | life_expectancy | 0.001595 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable across
# machines; consider a relative path or a configuration variable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Czechia'
country2 = 'France'
# Keep only the country-health (socio-economic) features plus identifiers and
# the target, and restrict the rows to the two countries being compared.
country_health_cols = [
    'location', 'date', 'human_development_index', 'extreme_poverty',
    'gdp_per_capita', 'population_density', 'population', 'Mortality Rate',
]
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, country_health_cols]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 0.900 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 0.900 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 0.900 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 0.900 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 0.900 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411710 |
| 9443 | France | 12/26/2022 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411282 |
| 9444 | France | 12/27/2022 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411730 |
| 9445 | France | 12/28/2022 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411813 |
| 9446 | France | 12/29/2022 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411892 |
2105 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per 'location' group so one country's first rows never
# inherit another country's mortality history.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the leading NaNs with 0 fabricates a "zero mortality"
# history at the start of each country's series — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# includes 'Mortality Rate' (the prediction target) and its lag columns —
# target leakage into the features; verify this is intended.
# NOTE(review): PCA is fitted on unscaled data and before the train/test
# split, so large-magnitude columns (e.g. population) dominate and test data
# leaks in.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are misleading — each column is a principal
# component (a linear combination of ALL PCA inputs), not the original feature
# it is named after. Downstream "feature importances" inherit this mislabel.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummy columns are created here but never used as
# model inputs below (X comes from principal_df only).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a time series mixes future and past
# observations between train and test; a chronological split would avoid
# look-ahead bias.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# Standardization is fitted on the training partition only, then applied to
# both partitions — this part is leak-free.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations; with cv=10 this is 3240 fits.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes across all available cores; scoring defaults to the
# regressor's R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9945372413226707
# Retrieve the tuned model. GridSearchCV (refit=True, the default) has already
# refit best_estimator_ on the full training set, so an extra fit() call here
# would just repeat that training and is omitted.
best_model = grid_search.best_estimator_
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) computes the KL divergence and returns inf when
# qk is 0 where pk > 0 — this run actually printed "Entropy Value: inf"
# because the test set contains zero mortality rates. Restrict the
# computation to strictly positive pairs so the metric stays finite.
positive = (y_test > 0) & (y_pred > 0)
entropy_val = entropy(y_test[positive], y_pred[positive])
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.06076319269700281 R2 Score: 0.9941415006640716 RMSE: 0.246502 Entropy Value: inf
# Rank the model inputs (the PCA-derived columns listed in selected_cols) by
# their XGBoost importance scores, highest first.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.600011 |
| 4 | population | 0.135056 |
| 2 | gdp_per_capita | 0.131486 |
| 0 | human_development_index | 0.103245 |
| 3 | population_density | 0.030201 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable across
# machines; consider a relative path or a configuration variable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Romania'
country2 = 'Slovakia'
# Keep only the population-health features (plus identifiers and the target)
# and restrict the rows to the two countries being compared.
population_health_cols = [
    'location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
    'median_age', 'Mortality Rate',
]
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.85 | 43.0 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.85 | 43.0 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.85 | 43.0 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.85 | 43.0 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.85 | 43.0 | 2.036403 |
2067 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per 'location' group so one country's first rows never
# inherit another country's mortality history.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the leading NaNs with 0 fabricates a "zero mortality"
# history at the start of each country's series — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# includes 'Mortality Rate' (the prediction target) and its lag columns —
# target leakage into the features; verify this is intended.
# NOTE(review): PCA is fitted on unscaled data and before the train/test
# split, so large-magnitude columns dominate and test data leaks in.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are misleading — each column is a principal
# component (a linear combination of ALL PCA inputs), not the original feature
# it is named after. Downstream "feature importances" inherit this mislabel.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummy columns are created here but never used as
# model inputs below (X comes from principal_df only).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a time series mixes future and past
# observations between train and test; a chronological split would avoid
# look-ahead bias.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# Standardization is fitted on the training partition only, then applied to
# both partitions — this part is leak-free.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations; with cv=10 this is 3240 fits.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes across all available cores; scoring defaults to the
# regressor's R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987386723526734
# Retrieve the tuned model. GridSearchCV (refit=True, the default) has already
# refit best_estimator_ on the full training set, so an extra fit() call here
# would just repeat that training and is omitted.
best_model = grid_search.best_estimator_
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) computes the KL divergence and returns inf when
# qk is 0 where pk > 0; zero mortality rates are common early in each series.
# Restrict the computation to strictly positive pairs so the metric stays
# finite and comparable across runs.
positive = (y_test > 0) & (y_pred > 0)
entropy_val = entropy(y_test[positive], y_pred[positive])
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0012543326875114565 R2 Score: 0.9992927222966834 RMSE: 0.035417 Entropy Value: 0.0001720479338483219
# Rank the model inputs (the PCA-derived columns listed in selected_cols) by
# their XGBoost importance scores, highest first.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.533194 |
| 1 | diabetes_prevalence | 0.296478 |
| 5 | aged_65_older | 0.155072 |
| 2 | female_smokers | 0.010748 |
| 6 | median_age | 0.003793 |
| 3 | male_smokers | 0.000546 |
| 4 | life_expectancy | 0.000168 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable across
# machines; consider a relative path or a configuration variable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Romania'
country2 = 'Slovakia'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the two countries under comparison. .copy() makes the filtered frame an
# independent DataFrame rather than a view of the original, so the lag-column
# assignments in the next cell do not trigger pandas' SettingWithCopyWarning
# (or silently fail to write on a chained-assignment view).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
2067 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day, 7 days, 30 days back), shifted within each
# country so one country's series never bleeds into another's. Leading rows that
# have no prior value (the shift-induced NaNs) are filled with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in {'prev_day_mortality': 1,
                          'prev_week_mortality': 7,
                          'prev_month_mortality': 30}.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the matrix fitted
# here still contains 'Mortality Rate' and its three lag columns — the prediction
# target leaks into the principal components, which likely inflates the downstream
# R^2. Also, PCA is fitted on unscaled data (StandardScaler is applied only after
# PCA), so huge-magnitude columns such as population dominate the components.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
# Project the same (unscaled) matrix onto the first 5 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PCA component scores, not the original variables —
# labelling them with the raw feature names is misleading, and the feature-importance
# table built later inherits these labels. 'PC1'..'PC5' would be honest names.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used — X comes from
# principal_df below and y from 'Mortality Rate' only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a daily time series lets future rows train a
# model evaluated on past rows; a chronological split would avoid that — TODO confirm
# this is intentional.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only; the test set is transformed with these
# training-set statistics in the next cell.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
# Hyperparameter search space for the XGBoost regressor
# (3 * 3 * 3 * 3 * 2 * 2 = 324 candidate combinations).
search_space = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelized across all
# available cores (n_jobs=-1).
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(),
                           param_grid=search_space,
                           cv=10,
                           n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9968518647339172
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): GridSearchCV's default refit=True already refits best_estimator_
# on the full training data, so this explicit fit retrains an identical model.
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy with two arguments computes KL divergence of the
# inputs treated as (renormalized) probability distributions — it is not a standard
# regression metric, and zeros in y_pred where y_test is nonzero make it infinite,
# as the "Entropy Value: inf" printed below shows.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0027447047876360095 R2 Score: 0.9984523495896989 RMSE: 0.052390 Entropy Value: inf
# Tabulate the tuned model's feature importances, highest first.
# (The 'feature' labels come from selected_cols, i.e. the PCA-score columns.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | human_development_index | 0.742119 |
| 1 | extreme_poverty | 0.205208 |
| 2 | gdp_per_capita | 0.027011 |
| 3 | population_density | 0.021735 |
| 4 | population | 0.003927 |
# Importing the dataframe of all 26 countries
# NOTE(review): re-reading the CSV resets df_updated for a fresh country pair;
# the absolute Windows path breaks portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the full frame (notebook cell output follows).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Belgium'
country2 = 'Estonia'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two countries under comparison. .copy() makes the filtered frame an
# independent DataFrame rather than a view, so the lag-column assignments in the
# next cell do not trigger pandas' SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.466423 |
2121 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day, 7 days, 30 days back), shifted per country so
# series never cross country boundaries; shift-induced leading NaNs become 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in {'prev_day_mortality': 1,
                          'prev_week_mortality': 7,
                          'prev_month_mortality': 30}.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date' — 'Mortality Rate' and
# its lag columns stay in the fitted matrix, leaking the target into the components.
# PCA is also fitted on unscaled data (scaling happens only later).
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
# Project the same (unscaled) matrix onto the first 7 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these are PCA component scores, not the raw variables; the raw
# feature names used here are misleading and propagate into the importance table.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream — X comes from
# principal_df and y from 'Mortality Rate' only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of a daily time series mixes future and past between
# train and test; a chronological split would be safer — TODO confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only; test data reuses these statistics.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10): 324 candidates * 10
# folds = 3240 fits, parallelized across all cores (n_jobs=-1)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989923831042263
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): GridSearchCV already refit best_estimator_ (default refit=True);
# this explicit fit retrains an identical model.
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy with two arguments is KL divergence over the
# inputs renormalized as probability distributions — not a standard regression
# metric, and it becomes infinite when y_pred has zeros where y_test does not.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.013599164509749826 R2 Score: 0.9989226161535332 RMSE: 0.116615 Entropy Value: 0.0008484654832886927
# Tabulate the tuned model's feature importances, highest first.
feature_importances = best_model.feature_importances_
# NOTE(review): selected_cols labels the PCA-score columns, so each 'feature' row
# is actually a principal component, not the raw variable it is named after.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.855940 |
| 0 | cardiovasc_death_rate | 0.105436 |
| 2 | female_smokers | 0.022113 |
| 5 | aged_65_older | 0.008940 |
| 6 | median_age | 0.007059 |
| 3 | male_smokers | 0.000455 |
| 4 | life_expectancy | 0.000058 |
# Importing the dataframe of all 26 countries
# NOTE(review): re-reading the CSV resets df_updated for a fresh country pair;
# the absolute Windows path breaks portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the full frame (notebook cell output follows).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Belgium'
country2 = 'Estonia'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the two countries under comparison. .copy() makes the filtered frame an
# independent DataFrame rather than a view, so the lag-column assignments in the
# next cell do not trigger pandas' SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
2121 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day, 7 days, 30 days back), shifted per country so
# series never cross country boundaries; shift-induced leading NaNs become 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in {'prev_day_mortality': 1,
                          'prev_week_mortality': 7,
                          'prev_month_mortality': 30}.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date' — 'Mortality Rate' and
# its lag columns stay in the fitted matrix, leaking the target into the components.
# PCA is also fitted on unscaled data (scaling happens only later).
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
# Project the same (unscaled) matrix onto the first 5 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these are PCA component scores, not the raw variables; the raw
# feature names used here are misleading and propagate into the importance table.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream — X comes from
# principal_df and y from 'Mortality Rate' only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of a daily time series mixes future and past between
# train and test; a chronological split would be safer — TODO confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only; test data reuses these statistics.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10): 324 candidates * 10
# folds = 3240 fits, parallelized across all cores (n_jobs=-1)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9981079962252715
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): GridSearchCV already refit best_estimator_ (default refit=True);
# this explicit fit retrains an identical model.
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy with two arguments is KL divergence over the
# inputs renormalized as probability distributions — not a regression metric, and
# zeros in y_pred where y_test is nonzero make it infinite, as the
# "Entropy Value: inf" printed below shows.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.04174307689414808 R2 Score: 0.9966929353111853 RMSE: 0.204311 Entropy Value: inf
# Tabulate the tuned model's feature importances, highest first.
feature_importances = best_model.feature_importances_
# NOTE(review): selected_cols labels the PCA-score columns, so each 'feature' row
# is actually a principal component, not the raw variable it is named after.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.595773 |
| 2 | gdp_per_capita | 0.261597 |
| 0 | human_development_index | 0.106863 |
| 3 | population_density | 0.034785 |
| 4 | population | 0.000981 |
# Importing the dataframe of all 26 countries
# NOTE(review): re-reading the CSV resets df_updated for a fresh country pair;
# the absolute Windows path breaks portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the full frame (notebook cell output follows).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Latvia'
country2 = 'Luxembourg'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two countries under comparison. .copy() makes the filtered frame an
# independent DataFrame rather than a view, so the lag-column assignments in the
# next cell do not trigger pandas' SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631969 |
2079 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day, 7 days, 30 days back), shifted per country so
# series never cross country boundaries; shift-induced leading NaNs become 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in {'prev_day_mortality': 1,
                          'prev_week_mortality': 7,
                          'prev_month_mortality': 30}.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date' — 'Mortality Rate' and
# its lag columns stay in the fitted matrix, leaking the target into the components.
# PCA is also fitted on unscaled data (scaling happens only later).
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
# Project the same (unscaled) matrix onto the first 7 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these are PCA component scores, not the raw variables; the raw
# feature names used here are misleading and propagate into the importance table.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream — X comes from
# principal_df and y from 'Mortality Rate' only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training data only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Gradient-boosted regressor to be tuned
regressor = xgb.XGBRegressor()
# Candidate hyperparameter values for the exhaustive search (324 combinations)
search_space = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelized over all cores
# (default scoring for a regressor is R^2)
grid_search = GridSearchCV(estimator=regressor, param_grid=search_space, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9988494053319295
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): GridSearchCV already refits the best estimator on the full training set
# (refit=True by default), so this extra fit is redundant but harmless.
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes the KL divergence between two probability
# distributions (normalizing its inputs); applying it to raw regression targets and
# predictions — which can contain zeros — is not a standard regression metric. Verify intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0014364972839239708 R2 Score: 0.9963828762121265 RMSE: 0.037901 Entropy Value: 0.0004385904063229603
# Rank the (principal-component) inputs by the tuned model's importance scores
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.760923 |
| 6 | median_age | 0.172695 |
| 0 | cardiovasc_death_rate | 0.035559 |
| 2 | female_smokers | 0.015432 |
| 5 | aged_65_older | 0.010307 |
| 3 | male_smokers | 0.004540 |
| 4 | life_expectancy | 0.000544 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; consider a relative
# path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis
country1 = 'Latvia'
country2 = 'Luxembourg'
# Keep the socioeconomic (country health index) predictors plus identifiers and the
# target, restricted to the two selected countries
socio_cols = ['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), socio_cols]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631969 |
2079 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the per-country time series into a supervised-learning problem by adding
# autoregressive lag features: mortality 1 day, 1 week, and 1 month earlier.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
lag_spec = (('prev_day_mortality', 1),
            ('prev_week_mortality', 7),
            ('prev_month_mortality', 30))
for lag_name, lag_days in lag_spec:
    df_updated[lag_name] = mortality_by_country.shift(lag_days)
# The earliest rows of each country have no history for a given lag; use 0 in place of NaN
for lag_name, _ in lag_spec:
    df_updated[lag_name] = df_updated[lag_name].fillna(0)
# Fit PCA on all predictor columns (everything after 'location' and 'date')
# to address multi-collinearity among the inputs
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
# Project the predictor columns onto the first 5 principal components.
# NOTE(review): pca was fit on the FULL dataset before the train/test split below,
# so the learned components leak information from the test rows — confirm this is acceptable.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of ALL inputs),
# not the original variables; reusing the raw feature names here makes the later
# feature-importance table read as if it ranked the raw variables — consider PC1..PC5 names.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used below — X is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: the 5 principal-component scores; y: the raw mortality-rate target
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using training-set statistics only — no leakage at this step)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 uses all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9974607843120801
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): GridSearchCV already refits the best estimator on the full training set
# (refit=True by default), so this extra fit is redundant but harmless.
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes the KL divergence between two probability
# distributions (normalizing its inputs); applying it to raw regression targets and
# predictions — which can contain zeros — is not a standard regression metric. Verify intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0028344280543466877 R2 Score: 0.9928628635395763 RMSE: 0.053239 Entropy Value: 0.00104898720029077
# Rank the (principal-component) inputs by the tuned model's importance scores
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.696631 |
| 2 | gdp_per_capita | 0.117422 |
| 3 | population_density | 0.094589 |
| 0 | human_development_index | 0.082520 |
| 4 | population | 0.008839 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; consider a relative
# path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis
country1 = 'Serbia'
country2 = 'Slovenia'
# Keep the population-health predictors plus identifiers and the target,
# restricted to the two selected countries
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.536669 |
2100 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the per-country time series into a supervised-learning problem by adding
# autoregressive lag features: mortality 1 day, 1 week, and 1 month earlier.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
lag_spec = (('prev_day_mortality', 1),
            ('prev_week_mortality', 7),
            ('prev_month_mortality', 30))
for lag_name, lag_days in lag_spec:
    df_updated[lag_name] = mortality_by_country.shift(lag_days)
# The earliest rows of each country have no history for a given lag; use 0 in place of NaN
for lag_name, _ in lag_spec:
    df_updated[lag_name] = df_updated[lag_name].fillna(0)
# Fit PCA on all predictor columns (everything after 'location' and 'date')
# to address multi-collinearity among the inputs
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
# Project the predictor columns onto the first 7 principal components.
# NOTE(review): pca was fit on the FULL dataset before the train/test split below,
# so the learned components leak information from the test rows — confirm this is acceptable.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of ALL inputs),
# not the original variables; reusing the raw feature names here makes the later
# feature-importance table read as if it ranked the raw variables — consider PC1..PC7 names.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used below — X is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: the 7 principal-component scores; y: the raw mortality-rate target
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using training-set statistics only — no leakage at this step)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 uses all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979858035568876
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): GridSearchCV already refits the best estimator on the full training set
# (refit=True by default), so this extra fit is redundant but harmless.
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes the KL divergence between two probability
# distributions (normalizing its inputs); applying it to raw regression targets and
# predictions — which can contain zeros — is not a standard regression metric. Verify intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002574183795502055 R2 Score: 0.9985128745675512 RMSE: 0.050736 Entropy Value: 0.0008625859343290145
# Rank the (principal-component) inputs by the tuned model's importance scores
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.477247 |
| 6 | median_age | 0.226017 |
| 0 | cardiovasc_death_rate | 0.198241 |
| 5 | aged_65_older | 0.067375 |
| 2 | female_smokers | 0.018691 |
| 3 | male_smokers | 0.008959 |
| 4 | life_expectancy | 0.003470 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; consider a relative
# path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis
country1 = 'Serbia'
country2 = 'Slovenia'
# Keep the socioeconomic (country health index) predictors plus identifiers and the
# target, restricted to the two selected countries
socio_cols = ['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), socio_cols]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 0.917 | 0.00 | 31400.840 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 0.917 | 0.00 | 31400.840 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 0.917 | 0.00 | 31400.840 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 0.917 | 0.00 | 31400.840 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 0.917 | 0.00 | 31400.840 | 102.619 | 2119843 | 0.536669 |
2100 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the per-country time series into a supervised-learning problem by adding
# autoregressive lag features: mortality 1 day, 1 week, and 1 month earlier.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
lag_spec = (('prev_day_mortality', 1),
            ('prev_week_mortality', 7),
            ('prev_month_mortality', 30))
for lag_name, lag_days in lag_spec:
    df_updated[lag_name] = mortality_by_country.shift(lag_days)
# The earliest rows of each country have no history for a given lag; use 0 in place of NaN
for lag_name, _ in lag_spec:
    df_updated[lag_name] = df_updated[lag_name].fillna(0)
# Fit PCA on all predictor columns (everything after 'location' and 'date')
# to address multi-collinearity among the inputs
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
# Project the predictor columns onto the first 5 principal components.
# NOTE(review): pca was fit on the FULL dataset before the train/test split below,
# so the learned components leak information from the test rows — confirm this is acceptable.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of ALL inputs),
# not the original variables; reusing the raw feature names here makes the later
# feature-importance table read as if it ranked the raw variables — consider PC1..PC5 names.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used below — X is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: the 5 principal-component scores; y: the raw mortality-rate target
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using training-set statistics only — no leakage at this step)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 uses all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9980947692896143
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): GridSearchCV already refits the best estimator on the full training set
# (refit=True by default), so this extra fit is redundant but harmless.
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes the KL divergence between two probability
# distributions (normalizing its inputs); applying it to raw regression targets and
# predictions — which can contain zeros — is not a standard regression metric. Verify intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004439908764146692 R2 Score: 0.9974350311533886 RMSE: 0.066633 Entropy Value: 0.001278250012959715
# Rank the (principal-component) inputs by the tuned model's importance scores
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.582591 |
| 0 | human_development_index | 0.184401 |
| 2 | gdp_per_capita | 0.111431 |
| 4 | population | 0.074040 |
| 3 | population_density | 0.047537 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; consider a relative
# path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Switzerland'
country2 = 'Canada'
# Keep identifiers, the population-health predictors, and the target, restricted
# to the two countries under comparison (equivalent to selecting the columns
# first and then filtering the rows, as the original did).
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
             'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
             'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.093162 |
2111 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality features (previous day / week / month) per country.
# These are legitimate autoregressive predictors: past values are already known
# at prediction time. Early rows with no history are filled with 0, as before.
for lag, col in [(1, 'prev_day_mortality'),
                 (7, 'prev_week_mortality'),
                 (30, 'prev_month_mortality')]:
    df_updated[col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
# Principal Component Analysis to address multi-collinearity among predictors.
# BUG FIX: the original fit PCA on df_updated.iloc[:, 2:], which still contained
# the current-day 'Mortality Rate' target — direct target leakage that inflated
# the downstream R^2. Fit only on true predictors (static features + lags).
predictor_cols = [c for c in df_updated.columns
                  if c not in ('location', 'date', 'Mortality Rate')]
pca = PCA()
pca.fit(df_updated[predictor_cols])
# Keep 7 components to match the number of population-health input variables.
n_components = 7
principal_components = pca.transform(df_updated[predictor_cols])[:, :n_components]
# NOTE(review): these columns are principal components, NOT the raw features;
# the raw-feature names are kept only for downstream compatibility, so the later
# feature-importance table measures PC importance, not raw-feature importance.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
             'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (the dummy columns are not used as model inputs).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training and testing sets for the XGBoost model.
# NOTE(review): a random shuffle split on time-series data mixes future and past
# observations; a chronological split would be more rigorous — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the PCA features. The scaler is fit on the training split only,
# so test-set statistics never influence the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
# Gradient-boosted regression trees; all hyperparameters below are tuned by grid search.
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations, each evaluated with 10-fold CV.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): plain k-fold CV ignores the temporal ordering of this time-series
# data; sklearn's TimeSeriesSplit would avoid validating on past rows while
# training on future ones — TODO confirm whether temporal evaluation is intended.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9993402391390793
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training data by
# default (refit=True), so this explicit fit is redundant, though harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence of the two
# vectors normalized as probability distributions — not a standard regression
# metric, and it is undefined for zero/negative entries. TODO confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004670392095474607 R2 Score: 0.9985772312539217 RMSE: 0.068340 Entropy Value: 0.0007126326576069527
# Per-feature gain importances from the tuned model.
feature_importances = best_model.feature_importances_
# NOTE(review): the model inputs are principal components whose columns reuse the
# raw-feature names, so these rows rank PCs — not the original raw features.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.860369 |
| 0 | cardiovasc_death_rate | 0.077197 |
| 5 | aged_65_older | 0.022825 |
| 6 | median_age | 0.020826 |
| 2 | female_smokers | 0.017709 |
| 3 | male_smokers | 0.000843 |
| 4 | life_expectancy | 0.000231 |
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded user-specific Windows path — consider a relative path
# or configuration value so the notebook runs on other machines. TODO confirm.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Switzerland'
country2 = 'Canada'
# Keep identifiers, the country-health predictors, and the target, restricted to
# the two countries under comparison (equivalent to selecting the columns first
# and then filtering the rows, as the original did).
keep_cols = ['location', 'date', 'human_development_index', 'extreme_poverty',
             'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.093162 |
2111 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality features (previous day / week / month) per country.
# These are legitimate autoregressive predictors: past values are already known
# at prediction time. Early rows with no history are filled with 0, as before.
for lag, col in [(1, 'prev_day_mortality'),
                 (7, 'prev_week_mortality'),
                 (30, 'prev_month_mortality')]:
    df_updated[col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
# Principal Component Analysis to address multi-collinearity among predictors.
# BUG FIX: the original fit PCA on df_updated.iloc[:, 2:], which still contained
# the current-day 'Mortality Rate' target — direct target leakage that inflated
# the downstream R^2. Fit only on true predictors (static features + lags).
predictor_cols = [c for c in df_updated.columns
                  if c not in ('location', 'date', 'Mortality Rate')]
pca = PCA()
pca.fit(df_updated[predictor_cols])
# Keep 5 components to match the number of country-health input variables.
n_components = 5
principal_components = pca.transform(df_updated[predictor_cols])[:, :n_components]
# NOTE(review): these columns are principal components, NOT the raw features;
# the raw-feature names are kept only for downstream compatibility, so the later
# feature-importance table measures PC importance, not raw-feature importance.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita',
             'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (the dummy columns are not used as model inputs).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita',
                 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training and testing sets for the XGBoost model.
# NOTE(review): a random shuffle split on time-series data mixes future and past
# observations; a chronological split would be more rigorous — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the PCA features. The scaler is fit on the training split only,
# so test-set statistics never influence the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
# Gradient-boosted regression trees; all hyperparameters below are tuned by grid search.
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations, each evaluated with 10-fold CV.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): plain k-fold CV ignores the temporal ordering of this time-series
# data; sklearn's TimeSeriesSplit would avoid validating on past rows while
# training on future ones — TODO confirm whether temporal evaluation is intended.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9982663840319784
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training data by
# default (refit=True), so this explicit fit is redundant, though harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence of the two
# vectors normalized as probability distributions — not a standard regression
# metric, and it is undefined for zero/negative entries. TODO confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008554909392847082 R2 Score: 0.9973938681248052 RMSE: 0.092493 Entropy Value: 0.0016355829056392212
# Per-feature gain importances from the tuned model.
feature_importances = best_model.feature_importances_
# NOTE(review): the model inputs are principal components whose columns reuse the
# raw-feature names, so these rows rank PCs — not the original raw features.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.725877 |
| 2 | gdp_per_capita | 0.104252 |
| 3 | population_density | 0.097611 |
| 0 | human_development_index | 0.063116 |
| 4 | population | 0.009144 |
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded user-specific Windows path — consider a relative path
# or configuration value so the notebook runs on other machines. TODO confirm.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Cyprus'
country2 = 'Denmark'
# Keep identifiers, the population-health predictors, and the target, restricted
# to the two countries under comparison (equivalent to selecting the columns
# first and then filtering the rows, as the original did).
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
             'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
             'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6244 | Denmark | 12/25/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.227772 |
| 6245 | Denmark | 12/26/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.227772 |
| 6246 | Denmark | 12/27/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.228905 |
| 6247 | Denmark | 12/28/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.229131 |
| 6248 | Denmark | 12/29/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.229131 |
2089 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality features (previous day / week / month) per country.
# These are legitimate autoregressive predictors: past values are already known
# at prediction time. Early rows with no history are filled with 0, as before.
for lag, col in [(1, 'prev_day_mortality'),
                 (7, 'prev_week_mortality'),
                 (30, 'prev_month_mortality')]:
    df_updated[col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
# Principal Component Analysis to address multi-collinearity among predictors.
# BUG FIX: the original fit PCA on df_updated.iloc[:, 2:], which still contained
# the current-day 'Mortality Rate' target — direct target leakage that inflated
# the downstream R^2. Fit only on true predictors (static features + lags).
predictor_cols = [c for c in df_updated.columns
                  if c not in ('location', 'date', 'Mortality Rate')]
pca = PCA()
pca.fit(df_updated[predictor_cols])
# Keep 7 components to match the number of population-health input variables.
n_components = 7
principal_components = pca.transform(df_updated[predictor_cols])[:, :n_components]
# NOTE(review): these columns are principal components, NOT the raw features;
# the raw-feature names are kept only for downstream compatibility, so the later
# feature-importance table measures PC importance, not raw-feature importance.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
             'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (the dummy columns are not used as model inputs).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training and testing sets for the XGBoost model.
# NOTE(review): a random shuffle split on time-series data mixes future and past
# observations; a chronological split would be more rigorous — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the PCA features. The scaler is fit on the training split only,
# so test-set statistics never influence the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
# Gradient-boosted regression trees; all hyperparameters below are tuned by grid search.
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations, each evaluated with 10-fold CV.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): plain k-fold CV ignores the temporal ordering of this time-series
# data; sklearn's TimeSeriesSplit would avoid validating on past rows while
# training on future ones — TODO confirm whether temporal evaluation is intended.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9981664102171948
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training data by
# default (refit=True), so this explicit fit is redundant, though harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence of the two
# vectors normalized as probability distributions — not a standard regression
# metric, and it is undefined for zero/negative entries. TODO confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.000955806320139703 R2 Score: 0.9992193301724611 RMSE: 0.030916 Entropy Value: 0.0005947775597474582
# Per-feature gain importances from the tuned model.
feature_importances = best_model.feature_importances_
# NOTE(review): the model inputs are principal components whose columns reuse the
# raw-feature names, so these rows rank PCs — not the original raw features.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 6 | median_age | 0.471901 |
| 5 | aged_65_older | 0.362727 |
| 1 | diabetes_prevalence | 0.138179 |
| 0 | cardiovasc_death_rate | 0.013290 |
| 2 | female_smokers | 0.007320 |
| 3 | male_smokers | 0.003781 |
| 4 | life_expectancy | 0.002802 |
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded user-specific Windows path — consider a relative path
# or configuration value so the notebook runs on other machines. TODO confirm.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Cyprus'
country2 = 'Denmark'
# Keep identifiers, the country-health predictors, and the target, restricted to
# the two countries under comparison (equivalent to selecting the columns first
# and then filtering the rows, as the original did).
keep_cols = ['location', 'date', 'human_development_index', 'extreme_poverty',
             'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6244 | Denmark | 12/25/2022 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.227772 |
| 6245 | Denmark | 12/26/2022 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.227772 |
| 6246 | Denmark | 12/27/2022 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.228905 |
| 6247 | Denmark | 12/28/2022 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.229131 |
| 6248 | Denmark | 12/29/2022 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.229131 |
2089 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 5 principal components (one per country-health-index input).
n_components = 5
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# BUG FIX: the components were previously labelled with the original feature
# names (human_development_index, extreme_poverty, ...). A principal component
# is a linear combination of ALL inputs, so that labelling made the downstream
# feature-importance table falsely attribute importance to single original
# variables. Label them PC1..PCn instead; to map a component back to original
# features, inspect pca.components_.
pc_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label in the source frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_cols
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train-test split for the XGBoost model.
# NOTE(review): train_test_split shuffles rows at random; on a time series this
# mixes future observations into the training set — a chronological split may
# be more appropriate. Confirm intent.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Learn standardisation statistics from the training set only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale both partitions with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Search grid: tree depth, shrinkage, ensemble size and regularisation knobs.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation on all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score (R^2 by default).
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9978802115578003
# GridSearchCV (refit=True by default) already refits the best estimator on the
# full training set, so the previous explicit best_model.fit(...) call was
# redundant and has been removed.
best_model = grid_search.best_estimator_
# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate: MSE, RMSE, R^2 and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as (unnormalised)
# probability distributions and computes a KL divergence; applying it to raw
# mortality values (which contain zeros) is of doubtful validity — confirm.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002391339100663906 R2 Score: 0.998046836221977 RMSE: 0.048901 Entropy Value: 0.001094689244735057
# Rank model inputs by XGBoost's importance scores, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.823494 |
| 0 | human_development_index | 0.106954 |
| 2 | gdp_per_capita | 0.050921 |
| 3 | population_density | 0.013679 |
| 4 | population | 0.004952 |
# Importing the dataframe of all 26 countries
# (local CSV export of the cleaned Our World in Data COVID-19 dataset;
# re-loaded fresh because the previous pipeline mutated df_updated in place).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Notebook-style expression to display the loaded frame.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the population-health-index analysis.
country1 = 'Finland'
country2 = 'Iceland'
# Population-health predictors retained for the XGBoost analysis.
health_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
               'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
df_updated = df_updated[['location', 'date'] + health_cols + ['Mortality Rate']]
# Keep only the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7311 | Finland | 1/30/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7312 | Finland | 1/31/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7313 | Finland | 2/1/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7314 | Finland | 2/2/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
2102 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag features (1 day / 1 week / 1 month of mortality history) per country,
# turning the time series into a supervised-learning table.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    # Leading NaNs produced by shift() within each country are zero-filled.
    df_updated[lag_col] = (
        df_updated.groupby(['location'])['Mortality Rate'].shift(lag).fillna(0)
    )
# PCA to mitigate multi-collinearity among the predictors.
# NOTE(review): iloc[:, 2:] includes the target and lag columns, and PCA is
# fitted on the full (pre-split) data — possible leakage; confirm intent.
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 7 principal components (one per population-health input).
n_components = 7
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# BUG FIX: the components were previously labelled with the original feature
# names (cardiovasc_death_rate, ...). Each principal component mixes ALL
# inputs, so that labelling made the importance table falsely attribute
# importance to single original variables. Use PC1..PCn; consult
# pca.components_ to relate components to original features.
pc_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label in the source frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_cols
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 split for training vs. testing the XGBoost model.
# NOTE(review): random shuffling of time-series rows leaks future information
# into training — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Standardisation statistics come from the training partition only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the train-fitted scaler to both partitions.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor to be tuned.
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid covering depth, learning rate, size and regularisation.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Grid search with 10-fold cross-validation, parallelised across cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the best configuration and its mean CV score (R^2 by default).
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9967115293459013
# GridSearchCV's default refit=True already refits the best estimator on the
# training data, so the former explicit best_model.fit(...) was redundant and
# has been removed.
best_model = grid_search.best_estimator_
# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate: MSE, RMSE, R^2 and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes a KL divergence between
# distributions; using raw mortality values (with zeros) is questionable.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002631359506918055 R2 Score: 0.9977846875396988 RMSE: 0.051297 Entropy Value: 0.0008231147347694647
# Sort the model's importance scores in descending order for display.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.424154 |
| 0 | cardiovasc_death_rate | 0.346029 |
| 2 | female_smokers | 0.087274 |
| 5 | aged_65_older | 0.081840 |
| 6 | median_age | 0.050001 |
| 3 | male_smokers | 0.008944 |
| 4 | life_expectancy | 0.001758 |
# Importing the dataframe of all 26 countries
# (fresh load of the cleaned OWID export; the prior pipeline mutated df_updated).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Notebook-style expression to display the loaded frame.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the country-health-index analysis.
country1 = 'Finland'
country2 = 'Iceland'
# Socio-economic (country-index) predictors retained for the XGBoost analysis.
index_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita',
              'population_density', 'population']
df_updated = df_updated[['location', 'date'] + index_cols + ['Mortality Rate']]
# Keep only the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7311 | Finland | 1/30/2020 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7312 | Finland | 1/31/2020 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7313 | Finland | 2/1/2020 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7314 | Finland | 2/2/2020 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
2102 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Per-country lag features of the mortality rate (previous day/week/month),
# converting the time series into a supervised-learning table.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    # Zero-fill the NaNs that shift() leaves at the start of each country.
    df_updated[lag_col] = (
        df_updated.groupby(['location'])['Mortality Rate'].shift(lag).fillna(0)
    )
# PCA against multi-collinearity.
# NOTE(review): fitted on the full data and on columns that include the target
# and lag features (iloc[:, 2:]) — potential leakage; confirm intent.
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 5 principal components (one per country-health-index input).
n_components = 5
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# BUG FIX: components were mislabelled with original feature names; a principal
# component is a mixture of all inputs, so the importance table would falsely
# credit single variables. Use neutral PC1..PCn labels; see pca.components_ for
# the actual loadings.
pc_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label in the source frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_cols
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train-test split.
# NOTE(review): shuffled split on time-series rows leaks future data into
# training — a chronological split may be preferable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Fit the scaler on training data only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Transform both partitions with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor to tune.
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameters.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# 10-fold cross-validated grid search using every available core.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Show the winning configuration and its mean CV score (R^2 by default).
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9954359249296372
# The best estimator is already refit on the training data by GridSearchCV
# (refit=True default); the redundant explicit fit call has been removed.
best_model = grid_search.best_estimator_
# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate: MSE, RMSE, R^2 and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): entropy() computes KL divergence between distributions; raw
# mortality values (with zeros) are a dubious input — confirm this metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004634197824994556 R2 Score: 0.9960985201154687 RMSE: 0.068075 Entropy Value: 0.0016279455279392522
# Build and display the importance ranking, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.567476 |
| 2 | gdp_per_capita | 0.207296 |
| 0 | human_development_index | 0.151705 |
| 3 | population_density | 0.054498 |
| 4 | population | 0.019025 |
# Importing the dataframe of all 26 countries
# (reloaded because the previous pipeline modified df_updated in place).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Notebook-style expression to display the loaded frame.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the population-health-index analysis.
country1 = 'Ireland'
country2 = 'Italy'
# Population-health predictors retained for the XGBoost analysis.
health_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
               'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
df_updated = df_updated[['location', 'date'] + health_cols + ['Mortality Rate']]
# Keep only the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
2099 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day / 1 week / 1 month) per country, making the
# time series usable as a supervised-learning problem.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    # Zero-fill leading NaNs from shift() within each country.
    df_updated[lag_col] = (
        df_updated.groupby(['location'])['Mortality Rate'].shift(lag).fillna(0)
    )
# PCA to counter multi-collinearity.
# NOTE(review): iloc[:, 2:] spans the target and lag columns and PCA is fitted
# before the split — both are possible leakage; confirm intent.
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 7 principal components (one per population-health input).
n_components = 7
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# BUG FIX: components were previously mislabelled with original feature names.
# Since each component blends all inputs, the downstream importance table would
# falsely attribute importance to single variables. Label PC1..PCn and use
# pca.components_ to interpret the loadings.
pc_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label in the source frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_cols
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train-test split.
# NOTE(review): random shuffling of temporal rows mixes the future into
# training — a chronological split may be preferable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Scaler statistics learned from the training partition only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale train and test features with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Regressor to tune.
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# 10-fold cross-validated grid search, parallelised over all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the best configuration and its mean CV score (R^2 by default).
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9992082203444431
# GridSearchCV already refits the best estimator on the training data
# (refit=True default); the redundant explicit fit has been removed.
best_model = grid_search.best_estimator_
# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate: MSE, RMSE, R^2 and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy expects probability distributions and
# computes KL divergence; feeding raw mortality values (with zeros) is of
# doubtful validity — confirm this metric choice.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008136783743045974 R2 Score: 0.9993315447885287 RMSE: 0.090204 Entropy Value: 0.0003728532754072914
# Importance ranking of the model inputs, most important first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | aged_65_older | 0.368700 |
| 1 | diabetes_prevalence | 0.257725 |
| 0 | cardiovasc_death_rate | 0.220660 |
| 6 | median_age | 0.135502 |
| 2 | female_smokers | 0.015439 |
| 4 | life_expectancy | 0.001092 |
| 3 | male_smokers | 0.000882 |
# Importing the dataframe of all 26 countries
# (fresh load; previous pipeline steps mutate df_updated in place).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Notebook-style expression to display the loaded frame.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this XGBoost run of the country health index.
country1 = 'Ireland'
country2 = 'Italy'
# Keep only the socio-economic (country health index) columns plus the target,
# restricted to the two selected countries.
keep_cols = ['location', 'date', 'human_development_index', 'extreme_poverty',
             'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
2099 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country mortality time series into supervised-learning
# features by adding 1-day, 7-day, and 30-day lags of the target.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # The first `lag_days` rows of each country's series have no history; use 0.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the raw, unscaled columns from position 2 onward,
# and at this point df_updated also contains the three lagged mortality columns
# derived from the target, so they enter the PCA input too — confirm that
# skipping standardization and including lagged targets is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the first 5 principal components are relabeled with the original
# feature names below, but each component is a linear mix of ALL PCA input
# columns (including the lagged mortality columns) — so the downstream "feature
# importances" describe components, not the named raw features. Confirm intended.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used below — X is built
# from principal_df and y from 'Mortality Rate' only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a purely random split of daily time-series rows mixes future
# days into the training set — consider a chronological split; TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale both splits with the scaler that was fitted on the training split only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor whose hyperparameters are tuned below.
regressor = xgb.XGBRegressor()
# Hyperparameter search space for the grid search.
search_space = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=regressor, param_grid=search_space, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validation score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9977141803352069
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits the best estimator on the full training set
# by default (refit=True), so best_estimator_ should already be fitted — this
# extra fit() looks redundant; TODO confirm.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns the KL divergence, not Shannon entropy;
# it is also undefined where y_test > 0 but y_pred == 0 — verify this metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.022014288790731012 R2 Score: 0.9981914763211477 RMSE: 0.148372 Entropy Value: 0.0014257512531825832
# Tabulate the tuned model's feature importances, largest first.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.605823 |
| 0 | human_development_index | 0.196309 |
| 3 | population_density | 0.107973 |
| 2 | gdp_per_capita | 0.076505 |
| 4 | population | 0.013390 |
# Load the combined Our World in Data dataframe covering all 26 countries.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this XGBoost run of the population health index.
country1 = 'Netherlands'
country2 = 'Portugal'
# Keep only the health-related (population health index) columns plus the
# target, restricted to the two selected countries.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
             'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
             'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11513 | Portugal | 12/25/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11514 | Portugal | 12/26/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11515 | Portugal | 12/27/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11516 | Portugal | 12/28/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11517 | Portugal | 12/29/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
2071 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country mortality time series into supervised-learning
# features by adding 1-day, 7-day, and 30-day lags of the target.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # The first `lag_days` rows of each country's series have no history; use 0.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the raw, unscaled columns from position 2 onward,
# and at this point df_updated also contains the three lagged mortality columns
# derived from the target, so they enter the PCA input too — confirm that
# skipping standardization and including lagged targets is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the first 7 principal components are relabeled with the original
# feature names below, but each component is a linear mix of ALL PCA input
# columns (including the lagged mortality columns) — so the downstream "feature
# importances" describe components, not the named raw features. Confirm intended.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used below — X is built
# from principal_df and y from 'Mortality Rate' only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a purely random split of daily time-series rows mixes future
# days into the training set — consider a chronological split; TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale both splits with the scaler that was fitted on the training split only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor whose hyperparameters are tuned below.
regressor = xgb.XGBRegressor()
# Hyperparameter search space for the grid search.
search_space = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=regressor, param_grid=search_space, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validation score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9990692627631146
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits the best estimator on the full training set
# by default (refit=True), so best_estimator_ should already be fitted — this
# extra fit() looks redundant; TODO confirm.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns the KL divergence, not Shannon entropy;
# it is also undefined where y_test > 0 but y_pred == 0 — verify this metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0038690919886303294 R2 Score: 0.9994973666749689 RMSE: 0.062202 Entropy Value: 0.00024895544762980527
# Tabulate the tuned model's feature importances, largest first.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.506845 |
| 0 | cardiovasc_death_rate | 0.422379 |
| 6 | median_age | 0.036212 |
| 2 | female_smokers | 0.026778 |
| 3 | male_smokers | 0.004003 |
| 5 | aged_65_older | 0.003742 |
| 4 | life_expectancy | 0.000041 |
# Load the combined Our World in Data dataframe covering all 26 countries.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this XGBoost run of the country health index.
country1 = 'Netherlands'
country2 = 'Portugal'
# Keep only the socio-economic (country health index) columns plus the target,
# restricted to the two selected countries.
keep_cols = ['location', 'date', 'human_development_index', 'extreme_poverty',
             'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11513 | Portugal | 12/25/2022 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.462977 |
| 11514 | Portugal | 12/26/2022 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.462977 |
| 11515 | Portugal | 12/27/2022 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.462977 |
| 11516 | Portugal | 12/28/2022 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.462977 |
| 11517 | Portugal | 12/29/2022 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.462977 |
2071 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country mortality time series into supervised-learning
# features by adding 1-day, 7-day, and 30-day lags of the target.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # The first `lag_days` rows of each country's series have no history; use 0.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the raw, unscaled columns from position 2 onward,
# and at this point df_updated also contains the three lagged mortality columns
# derived from the target, so they enter the PCA input too — confirm that
# skipping standardization and including lagged targets is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the first 5 principal components are relabeled with the original
# feature names below, but each component is a linear mix of ALL PCA input
# columns (including the lagged mortality columns) — so the downstream "feature
# importances" describe components, not the named raw features. Confirm intended.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used below — X is built
# from principal_df and y from 'Mortality Rate' only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a purely random split of daily time-series rows mixes future
# days into the training set — consider a chronological split; TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale both splits with the scaler that was fitted on the training split only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor whose hyperparameters are tuned below.
regressor = xgb.XGBRegressor()
# Hyperparameter search space for the grid search.
search_space = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=regressor, param_grid=search_space, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validation score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9978978820744411
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits the best estimator on the full training set
# by default (refit=True), so best_estimator_ should already be fitted — this
# extra fit() looks redundant; TODO confirm.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns the KL divergence, not Shannon entropy;
# it is also undefined where y_test > 0 but y_pred == 0 — verify this metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.011370657753647056 R2 Score: 0.9985228390714666 RMSE: 0.106633 Entropy Value: 0.0014961642094780694
# Tabulate the tuned model's feature importances, largest first.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.691019 |
| 2 | gdp_per_capita | 0.177634 |
| 0 | human_development_index | 0.087000 |
| 3 | population_density | 0.041211 |
| 4 | population | 0.003136 |
# Load the combined Our World in Data dataframe covering all 26 countries.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this XGBoost run of the population health index.
country1 = 'Spain'
country2 = 'Sweden'
# Keep only the health-related (population health index) columns plus the
# target, restricted to the two selected countries.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
             'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
             'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 23011 | Sweden | 2/1/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| 23012 | Sweden | 2/2/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| 23013 | Sweden | 2/3/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| 23014 | Sweden | 2/4/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| 23015 | Sweden | 2/5/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
2126 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country mortality time series into supervised-learning
# features by adding 1-day, 7-day, and 30-day lags of the target.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # The first `lag_days` rows of each country's series have no history; use 0.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the raw, unscaled columns from position 2 onward,
# and at this point df_updated also contains the three lagged mortality columns
# derived from the target, so they enter the PCA input too — confirm that
# skipping standardization and including lagged targets is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the first 7 principal components are relabeled with the original
# feature names below, but each component is a linear mix of ALL PCA input
# columns (including the lagged mortality columns) — so the downstream "feature
# importances" describe components, not the named raw features. Confirm intended.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used below — X is built
# from principal_df and y from 'Mortality Rate' only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a purely random split of daily time-series rows mixes future
# days into the training set — consider a chronological split; TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the scaler that was fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBoost regressor plus the hyperparameter grid to search over.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}

# Exhaustive grid search with 10-fold cross-validation, parallelised across all cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989620255205065
# Fit the model using the best hyperparameters found by the grid search
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(p, q) computes the KL divergence of the two vectors after normalising
# them to probability distributions; it returns inf whenever y_test has a zero entry where
# y_pred does not.  Restrict the comparison to strictly positive pairs so the reported
# statistic stays finite (NaN if no such pairs exist).
_positive = (np.asarray(y_test) > 0) & (np.asarray(y_pred) > 0)
entropy_val = entropy(y_test[_positive], y_pred[_positive]) if _positive.any() else float('nan')
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.013116878471431913 R2 Score: 0.9984614298768997 RMSE: 0.114529 Entropy Value: 0.0007355015755679387
# Rank the PCA-derived inputs by the importance XGBoost assigned to each of them.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.917876 |
| 2 | female_smokers | 0.040045 |
| 0 | cardiovasc_death_rate | 0.020071 |
| 6 | median_age | 0.012055 |
| 5 | aged_65_older | 0.006507 |
| 3 | male_smokers | 0.002973 |
| 4 | life_expectancy | 0.000473 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; prefer a relative path
# or a configurable data directory so the analysis runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run.
country1 = 'Spain'
country2 = 'Sweden'
# Restrict to the socio-economic features (plus identifiers and target) used by the
# XGBoost analysis for the country health index, keeping only the selected pair.
feature_cols = ['location', 'date', 'human_development_index', 'extreme_poverty',
                'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 23011 | Sweden | 2/1/2020 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| 23012 | Sweden | 2/2/2020 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| 23013 | Sweden | 2/3/2020 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| 23014 | Sweden | 2/4/2020 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| 23015 | Sweden | 2/5/2020 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
2126 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per location, so one country's lags never read another country's values.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the first 1/7/30 rows per country with 0 conflates "no history yet"
# with "zero mortality"; dropping those warm-up rows may be more appropriate — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips 'location' and 'date' (the first two columns after the
# selection above) but still includes 'Mortality Rate' and the three lag columns just created,
# so the prediction target itself feeds the PCA inputs; this leakage likely inflates the
# near-perfect R^2 below.  PCA is also fitted on the full dataset before the train/test split
# and on unscaled features, where large-magnitude columns (e.g. population) dominate variance.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (PC1..PC5), not the original variables;
# labelling them with the raw feature names makes the later "feature importance" table read as
# if it ranked the raw variables when it actually ranks components — rename to PC1..PC5.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X is built from principal_df and only
# 'Mortality Rate' is read back from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split shuffles a time series; together with the target-derived
# PCA inputs noted above, this makes the held-out score optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the scaler that was fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBoost regressor plus the hyperparameter grid to search over.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}

# Exhaustive grid search with 10-fold cross-validation, parallelised across all cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987146553286387
# Fit the model using the best hyperparameters found by the grid search
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(p, q) computes the KL divergence of the two vectors after normalising
# them to probability distributions; it returns inf whenever y_test has a zero entry where
# y_pred does not.  Restrict the comparison to strictly positive pairs so the reported
# statistic stays finite (NaN if no such pairs exist).
_positive = (np.asarray(y_test) > 0) & (np.asarray(y_pred) > 0)
entropy_val = entropy(y_test[_positive], y_pred[_positive]) if _positive.any() else float('nan')
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.01981169414057557 R2 Score: 0.9976761482726947 RMSE: 0.140754 Entropy Value: 0.0009592747648842666
# Rank the PCA-derived inputs by the importance XGBoost assigned to each of them.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.926065 |
| 2 | gdp_per_capita | 0.056652 |
| 3 | population_density | 0.011133 |
| 0 | human_development_index | 0.005290 |
| 4 | population | 0.000861 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; prefer a relative path
# or a configurable data directory so the analysis runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run.
country1 = 'United Kingdom'
country2 = 'United States'
# Restrict to the population-health features (plus identifiers and target) used by the
# XGBoost analysis for the population health index, keeping only the selected pair.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
                'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084791 |
2136 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per location, so one country's lags never read another country's values.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the first 1/7/30 rows per country with 0 conflates "no history yet"
# with "zero mortality"; dropping those warm-up rows may be more appropriate — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips 'location' and 'date' (the first two columns after the
# selection above) but still includes 'Mortality Rate' and the three lag columns just created,
# so the prediction target itself feeds the PCA inputs; this leakage likely inflates the
# R^2 below.  PCA is also fitted on the full dataset before the train/test split and on
# unscaled features, where large-magnitude columns dominate the variance.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (PC1..PC7), not the original variables;
# labelling them with the raw feature names makes the later "feature importance" table read as
# if it ranked the raw variables when it actually ranks components — rename to PC1..PC7.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X is built from principal_df and only
# 'Mortality Rate' is read back from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split shuffles a time series; together with the target-derived
# PCA inputs noted above, this makes the held-out score optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the scaler that was fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBoost regressor plus the hyperparameter grid to search over.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}

# Exhaustive grid search with 10-fold cross-validation, parallelised across all cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9573756200168303
# Fit the model using the best hyperparameters found by the grid search
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(p, q) computes the KL divergence of the two vectors after normalising
# them to probability distributions; it returns inf whenever y_test has a zero entry where
# y_pred does not.  Restrict the comparison to strictly positive pairs so the reported
# statistic stays finite (NaN if no such pairs exist).
_positive = (np.asarray(y_test) > 0) & (np.asarray(y_pred) > 0)
entropy_val = entropy(y_test[_positive], y_pred[_positive]) if _positive.any() else float('nan')
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 3.067522668688002 R2 Score: 0.8735128665771991 RMSE: 1.751434 Entropy Value: 0.00997828364305316
# Rank the PCA-derived inputs by the importance XGBoost assigned to each of them.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.515435 |
| 1 | diabetes_prevalence | 0.265632 |
| 2 | female_smokers | 0.128937 |
| 5 | aged_65_older | 0.040833 |
| 6 | median_age | 0.026848 |
| 4 | life_expectancy | 0.013474 |
| 3 | male_smokers | 0.008841 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; prefer a relative path
# or a configurable data directory so the analysis runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run.
country1 = 'United Kingdom'
country2 = 'United States'
# Restrict to the socio-economic features (plus identifiers and target) used by the
# XGBoost analysis for the country health index, keeping only the selected pair.
feature_cols = ['location', 'date', 'human_development_index', 'extreme_poverty',
                'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2136 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per location, so one country's lags never read another country's values.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the first 1/7/30 rows per country with 0 conflates "no history yet"
# with "zero mortality"; dropping those warm-up rows may be more appropriate — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips 'location' and 'date' (the first two columns after the
# selection above) but still includes 'Mortality Rate' and the three lag columns just created,
# so the prediction target itself feeds the PCA inputs; this leakage likely inflates the
# R^2 below.  PCA is also fitted on the full dataset before the train/test split and on
# unscaled features, where large-magnitude columns (e.g. population) dominate the variance.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (PC1..PC5), not the original variables;
# labelling them with the raw feature names makes the later "feature importance" table read as
# if it ranked the raw variables when it actually ranks components — rename to PC1..PC5.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X is built from principal_df and only
# 'Mortality Rate' is read back from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split shuffles a time series; together with the target-derived
# PCA inputs noted above, this makes the held-out score optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the scaler that was fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBoost regressor plus the hyperparameter grid to search over.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}

# Exhaustive grid search with 10-fold cross-validation, parallelised across all cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9551791807835526
# Fit the model using the best hyperparameters found by the grid search
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(p, q) computes the KL divergence of the two vectors after normalising
# them to probability distributions; it returns inf whenever y_test has a zero entry where
# y_pred does not (this run previously reported "Entropy Value: inf" for exactly that
# reason).  Restrict the comparison to strictly positive pairs so the reported statistic
# stays finite (NaN if no such pairs exist).
_positive = (np.asarray(y_test) > 0) & (np.asarray(y_pred) > 0)
entropy_val = entropy(y_test[_positive], y_pred[_positive]) if _positive.any() else float('nan')
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 1.4775238366856351 R2 Score: 0.9390753468347937 RMSE: 1.215534 Entropy Value: inf
# Rank the PCA-derived inputs by the importance XGBoost assigned to each of them.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.360283 |
| 4 | population | 0.281991 |
| 0 | human_development_index | 0.175041 |
| 2 | gdp_per_capita | 0.135475 |
| 3 | population_density | 0.047210 |
# Country Pair by Pair Analysis relative to human development index
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): hard-coded absolute Windows path — not portable; prefer a relative path
# or a configurable data directory so the analysis runs on other machines.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Showing the pairings of countries based on human development index (13 pairs of countries)
# One row-filtered view per country of interest.
df_Austria = df.loc[df["location"] == "Austria"]
df_Belgium = df.loc[df["location"] == "Belgium"]
df_Canada = df.loc[df["location"] == "Canada"]
df_Denmark = df.loc[df["location"] == "Denmark"]
df_Finland = df.loc[df["location"] == "Finland"]
df_Iceland = df.loc[df["location"] == "Iceland"]
df_Ireland = df.loc[df["location"] == "Ireland"]
df_Luxembourg = df.loc[df["location"] == "Luxembourg"]
df_Netherlands = df.loc[df["location"] == "Netherlands"]
df_Slovenia = df.loc[df["location"] == "Slovenia"]
df_Sweden = df.loc[df["location"] == "Sweden"]
df_Switzerland = df.loc[df["location"] == "Switzerland"]
df_UnitedKingdom = df.loc[df["location"] == "United Kingdom"]
df_UnitedStates = df.loc[df["location"] == "United States"]
df_Cyprus = df.loc[df["location"] == "Cyprus"]
df_Czechia = df.loc[df["location"] == "Czechia"]
df_Estonia = df.loc[df["location"] == "Estonia"]
df_France = df.loc[df["location"] == "France"]
df_Italy = df.loc[df["location"] == "Italy"]
df_Latvia = df.loc[df["location"] == "Latvia"]
df_Portugal = df.loc[df["location"] == "Portugal"]
df_Slovakia = df.loc[df["location"] == "Slovakia"]
df_Spain = df.loc[df["location"] == "Spain"]
df_Bulgaria = df.loc[df["location"] == "Bulgaria"]
df_Romania = df.loc[df["location"] == "Romania"]
df_Serbia = df.loc[df["location"] == "Serbia"]
# Drop the first two UK rows (equivalent to tail(-2)) — presumably to align
# the UK series with its pair partner; TODO confirm intent.
df_UnitedKingdom_new = df_UnitedKingdom.iloc[2:]
# Stack the per-country frames (one per pairing, same order as before) into a
# single dataframe and persist it for the pair-by-pair analyses below.
frames = [
    df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark,
    df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia,
    df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg,
    df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain,
    df_Slovenia, df_UnitedStates,
]
dataframe_one = pd.concat(frames)
# Exporting the combined dataframe to a CSV file
dataframe_one.to_csv("dataframe-one.csv")
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute local Windows path — not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# First HDI pair under analysis.
country1 = 'Austria'
country2 = 'Belgium'
# Keep only the population-health features used by the XGBoost analysis,
# then restrict the rows to the current pair of countries.
population_health_cols = [
    'location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
    'median_age', 'Mortality Rate',
]
df_updated = df_updated[population_health_cols]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
2099 rows × 10 columns
# Convert the time series into a supervised-learning layout: add lagged
# mortality features (previous day / week / month) with groupby+shift so each
# row becomes an independent observation usable by XGBoost. Rows with no
# history for a given lag (the first `lag` days per country) are filled with 0.
lag_spec = [
    ("prev_day_mortality", 1),
    ("prev_week_mortality", 7),
    ("prev_month_mortality", 30),
]
for lag_col, lag_days in lag_spec:
    shifted = df_updated.groupby("location")["Mortality Rate"].shift(lag_days)
    df_updated[lag_col] = shifted.fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date',
# which at this point includes 'Mortality Rate' (the prediction target) and
# its three lag columns — the target leaks into the PCA inputs and likely
# inflates the downstream R^2. Consider excluding those columns here.
# NOTE(review): PCA is fitted on unscaled data, so large-magnitude columns
# dominate the components; standard practice is to standardize first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal-component scores (linear mixes of
# ALL input columns), not the original variables — reusing raw feature names
# here makes the later feature-importance table misleading (e.g. 'median_age'
# actually labels PC7, not the median_age variable).
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used — X below is
# built from principal_df only; this step effectively just removes the
# 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds PC scores labeled with raw feature names; y is the mortality target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
# NOTE(review): a shuffled split on a time series mixes past and future rows
# between train and test — consider a chronological split instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardization statistics from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the statistics learned from the training set.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; hyperparameters are tuned by the grid search below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the XGBoost regressor.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9985877971766527
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits
# best_estimator_ on the training data, so this fit is redundant — harmless,
# but could be dropped. TODO confirm against the sklearn version in use.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability
# distributions and computes KL divergence — it is not a regression error
# metric, and y_test contains exact zeros (early-pandemic rows), which makes
# the value hard to interpret. Consider dropping it or using MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005399476083504289 R2 Score: 0.9995439543541608 RMSE: 0.073481 Entropy Value: 0.0003649161068700861
# Rank the model inputs by their XGBoost importance scores (highest first).
importance_scores = best_model.feature_importances_
feature_importances = pd.DataFrame(
    {"feature": selected_cols, "importance": importance_scores}
)
feature_importances = feature_importances.sort_values(by="importance", ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 6 | median_age | 0.846396 |
| 0 | cardiovasc_death_rate | 0.062219 |
| 1 | diabetes_prevalence | 0.061911 |
| 5 | aged_65_older | 0.025860 |
| 2 | female_smokers | 0.002106 |
| 3 | male_smokers | 0.001366 |
| 4 | life_expectancy | 0.000142 |
# Importing the dataframe of all 26 countries
# (the "dataframe-one.csv" exported earlier; absolute local path, not portable).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Same HDI pair, now for the country-health (socioeconomic) feature set.
country1 = 'Austria'
country2 = 'Belgium'
# Keep only the country-health features used by the XGBoost analysis,
# then restrict the rows to the current pair of countries.
country_health_cols = [
    'location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty',
    'gdp_per_capita', 'population_density', 'population', 'Mortality Rate',
]
df_updated = df_updated[country_health_cols]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 5.64 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 5.64 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 5.64 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 5.64 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 5.64 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
2099 rows × 8 columns
# Convert the time series into a supervised-learning layout: add lagged
# mortality features (previous day / week / month) with groupby+shift so each
# row becomes an independent observation usable by XGBoost. Rows with no
# history for a given lag (the first `lag` days per country) are filled with 0.
lag_spec = [
    ("prev_day_mortality", 1),
    ("prev_week_mortality", 7),
    ("prev_month_mortality", 30),
]
for lag_col, lag_days in lag_spec:
    shifted = df_updated.groupby("location")["Mortality Rate"].shift(lag_days)
    df_updated[lag_col] = shifted.fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date',
# which at this point includes 'Mortality Rate' (the prediction target) and
# its three lag columns — the target leaks into the PCA inputs and likely
# inflates the downstream R^2. Consider excluding those columns here.
# NOTE(review): PCA is fitted on unscaled data, so large-magnitude columns
# (e.g. population) dominate the components; standardize first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal-component scores (linear mixes of
# ALL input columns), not the original variables — reusing raw feature names
# makes the later feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used — X below is
# built from principal_df only; this step effectively just removes the
# 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds PC scores labeled with raw feature names; y is the mortality target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
# NOTE(review): a shuffled split on a time series mixes past and future rows
# between train and test — consider a chronological split instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardization statistics from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the statistics learned from the training set.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; hyperparameters are tuned by the grid search below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the XGBoost regressor.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979482681643022
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits
# best_estimator_ on the training data, so this fit is redundant — harmless,
# but could be dropped. TODO confirm against the sklearn version in use.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability
# distributions and computes KL divergence — it is not a regression error
# metric, and y_test contains exact zeros (early-pandemic rows), which makes
# the value hard to interpret. Consider dropping it or using MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.017351533220202466 R2 Score: 0.9985344705576373 RMSE: 0.131725 Entropy Value: 0.0015777407587553624
# Rank the model inputs by their XGBoost importance scores (highest first).
importance_scores = best_model.feature_importances_
feature_importances = pd.DataFrame(
    {"feature": selected_cols, "importance": importance_scores}
)
feature_importances = feature_importances.sort_values(by="importance", ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.700511 |
| 2 | gdp_per_capita | 0.139220 |
| 0 | hospital_beds_per_thousand | 0.123668 |
| 3 | population_density | 0.032006 |
| 4 | population | 0.004595 |
# Importing the dataframe of all 26 countries
# (the "dataframe-one.csv" exported earlier; absolute local path, not portable).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Second HDI pair under analysis.
country1 = 'Canada'
country2 = 'Denmark'
# Keep only the population-health features used by the XGBoost analysis,
# then restrict the rows to the current pair of countries.
population_health_cols = [
    'location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
    'median_age', 'Mortality Rate',
]
df_updated = df_updated[population_health_cols]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.093162 |
2134 rows × 10 columns
# Convert the time series into a supervised-learning layout: add lagged
# mortality features (previous day / week / month) with groupby+shift so each
# row becomes an independent observation usable by XGBoost. Rows with no
# history for a given lag (the first `lag` days per country) are filled with 0.
lag_spec = [
    ("prev_day_mortality", 1),
    ("prev_week_mortality", 7),
    ("prev_month_mortality", 30),
]
for lag_col, lag_days in lag_spec:
    shifted = df_updated.groupby("location")["Mortality Rate"].shift(lag_days)
    df_updated[lag_col] = shifted.fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date',
# which at this point includes 'Mortality Rate' (the prediction target) and
# its three lag columns — the target leaks into the PCA inputs and likely
# inflates the downstream R^2. Consider excluding those columns here.
# NOTE(review): PCA is fitted on unscaled data, so large-magnitude columns
# dominate the components; standard practice is to standardize first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal-component scores (linear mixes of
# ALL input columns), not the original variables — reusing raw feature names
# makes the later feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used — X below is
# built from principal_df only; this step effectively just removes the
# 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds PC scores labeled with raw feature names; y is the mortality target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
# NOTE(review): a shuffled split on a time series mixes past and future rows
# between train and test — consider a chronological split instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardization statistics from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the statistics learned from the training set.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; hyperparameters are tuned by the grid search below.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the XGBoost regressor.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9992606850560544
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits
# best_estimator_ on the training data, so this fit is redundant — harmless,
# but could be dropped. TODO confirm against the sklearn version in use.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability
# distributions and computes KL divergence — it is not a regression error
# metric, and y_test contains exact zeros (early-pandemic rows), which makes
# the value hard to interpret. Consider dropping it or using MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0025729522227882153 R2 Score: 0.9993859368044151 RMSE: 0.050724 Entropy Value: 0.00030984264476932686
# Rank the model inputs by their XGBoost importance scores (highest first).
importance_scores = best_model.feature_importances_
feature_importances = pd.DataFrame(
    {"feature": selected_cols, "importance": importance_scores}
)
feature_importances = feature_importances.sort_values(by="importance", ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.665696 |
| 6 | median_age | 0.173524 |
| 0 | cardiovasc_death_rate | 0.127952 |
| 5 | aged_65_older | 0.019835 |
| 2 | female_smokers | 0.011763 |
| 3 | male_smokers | 0.001135 |
| 4 | life_expectancy | 0.000095 |
# Importing the dataframe of all 26 countries
# (the "dataframe-one.csv" exported earlier; absolute local path, not portable).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Canada'
country2 = 'Denmark'
# Keep only the country-health-index features (plus identifiers and the
# target) and restrict the rows to the two countries under comparison.
index_cols = ['location', 'date',
              'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita',
              'population_density', 'population', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, index_cols]
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 2.5 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 2.5 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 2.5 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 2.5 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 2.5 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.5 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.5 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.5 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.5 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.5 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.093162 |
2134 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Lags are computed per country so a shift never crosses the boundary
# between the two locations' time series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows fabricates "zero mortality"
# observations at the start of each country's series; dropping those rows
# may be preferable — TODO confirm intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] keeps every column after 'location'/'date' —
# including 'Mortality Rate' (the prediction target) and its lags — so the
# target leaks into the components later used as model inputs, which likely
# explains the near-perfect downstream R^2. PCA is also fit on the full
# dataset before the train/test split (a second leak). Both need a
# coordinated fix together with the matching transform step below.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Reduce the PCA output to 5 components, matching the number of raw input
# variables used for the country-health-index model.
n_components = 5
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# FIX: each principal component is a linear mixture of *all* PCA input
# columns, not one original variable, so labelling components with the raw
# feature names ('hospital_beds_per_thousand', ...) misattributed the
# downstream XGBoost feature importances. Label them honestly as PC1..PC5.
# NOTE(review): the PCA input (iloc[:, 2:]) still contains 'Mortality Rate'
# and its lags, i.e. the target leaks into these components.
pc_labels = ['PC{}'.format(i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'; note the dummy columns are not used in X below —
# only 'Mortality Rate' is taken from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_labels
X = principal_df[selected_cols].values
# Target: the current-day mortality rate.
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only; it is applied to both splits later.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Search space for the exhaustive grid search.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Grid search with 10-fold cross-validation; fit() returns the fitted search
# object, so construction and fitting are chained.
grid_search = GridSearchCV(
    estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1
).fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9981308764210667
# Refit the tuned estimator on the training split and evaluate it on the
# held-out test split.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)
# Test-set metrics: MSE, R^2, RMSE and relative entropy.
# NOTE(review): scipy.stats.entropy normalizes both arguments into
# probability distributions and returns their KL divergence — an unusual
# regression metric (infinite when y_pred is 0 where y_test is not).
# TODO confirm this is intentional.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.006611715252313853 R2 Score: 0.998422041824106 RMSE: 0.081312 Entropy Value: 0.001601041202106293
# Pull the fitted model's importance scores and rank the inputs by them.
raw_importances = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': raw_importances})
    .sort_values(by='importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.723446 |
| 0 | hospital_beds_per_thousand | 0.146505 |
| 2 | gdp_per_capita | 0.065728 |
| 3 | population_density | 0.048554 |
| 4 | population | 0.015767 |
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded, user-specific absolute path — consider a relative
# path or a configurable constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Finland'
country2 = 'Iceland'
# Keep only the population-health-index features (plus identifiers and the
# target) and restrict the rows to the two countries under comparison.
index_cols = ['location', 'date',
              'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
              'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
              'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, index_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7311 | Finland | 1/30/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7312 | Finland | 1/31/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7313 | Finland | 2/1/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7314 | Finland | 2/2/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
2102 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Lags are computed per country so a shift never crosses the boundary
# between the two locations' time series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows fabricates "zero mortality"
# observations at the start of each country's series; dropping those rows
# may be preferable — TODO confirm intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] keeps every column after 'location'/'date' —
# including 'Mortality Rate' (the prediction target) and its lags — so the
# target leaks into the components later used as model inputs, which likely
# explains the near-perfect downstream R^2. PCA is also fit on the full
# dataset before the train/test split (a second leak). Both need a
# coordinated fix together with the matching transform step below.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Reduce the PCA output to 7 components, matching the number of raw input
# variables used for the population-health-index model.
n_components = 7
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# FIX: each principal component is a linear mixture of *all* PCA input
# columns, not one original variable, so labelling components with the raw
# feature names ('cardiovasc_death_rate', ...) misattributed the
# downstream XGBoost feature importances. Label them honestly as PC1..PC7.
# NOTE(review): the PCA input (iloc[:, 2:]) still contains 'Mortality Rate'
# and its lags, i.e. the target leaks into these components.
pc_labels = ['PC{}'.format(i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'; note the dummy columns are not used in X below —
# only 'Mortality Rate' is taken from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_labels
X = principal_df[selected_cols].values
# Target: the current-day mortality rate.
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only; it is applied to both splits later.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Search space for the exhaustive grid search.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Grid search with 10-fold cross-validation; fit() returns the fitted search
# object, so construction and fitting are chained.
grid_search = GridSearchCV(
    estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1
).fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9967115293459013
# Refit the tuned estimator on the training split and evaluate it on the
# held-out test split.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)
# Test-set metrics: MSE, R^2, RMSE and relative entropy.
# NOTE(review): scipy.stats.entropy normalizes both arguments into
# probability distributions and returns their KL divergence — an unusual
# regression metric (infinite when y_pred is 0 where y_test is not).
# TODO confirm this is intentional.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002631359506918055 R2 Score: 0.9977846875396988 RMSE: 0.051297 Entropy Value: 0.0008231147347694647
# Pull the fitted model's importance scores and rank the inputs by them.
raw_importances = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': raw_importances})
    .sort_values(by='importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.424154 |
| 0 | cardiovasc_death_rate | 0.346029 |
| 2 | female_smokers | 0.087274 |
| 5 | aged_65_older | 0.081840 |
| 6 | median_age | 0.050001 |
| 3 | male_smokers | 0.008944 |
| 4 | life_expectancy | 0.001758 |
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded, user-specific absolute path — consider a relative
# path or a configurable constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Finland'
country2 = 'Iceland'
# Keep only the country-health-index features (plus identifiers and the
# target) and restrict the rows to the two countries under comparison.
index_cols = ['location', 'date',
              'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita',
              'population_density', 'population', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, index_cols]
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 3.28 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7311 | Finland | 1/30/2020 | 3.28 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7312 | Finland | 1/31/2020 | 3.28 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7313 | Finland | 2/1/2020 | 3.28 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7314 | Finland | 2/2/2020 | 3.28 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
2102 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Lags are computed per country so a shift never crosses the boundary
# between the two locations' time series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows fabricates "zero mortality"
# observations at the start of each country's series; dropping those rows
# may be preferable — TODO confirm intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] keeps every column after 'location'/'date' —
# including 'Mortality Rate' (the prediction target) and its lags — so the
# target leaks into the components later used as model inputs, which likely
# explains the near-perfect downstream R^2. PCA is also fit on the full
# dataset before the train/test split (a second leak). Both need a
# coordinated fix together with the matching transform step below.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Reduce the PCA output to 5 components, matching the number of raw input
# variables used for the country-health-index model.
n_components = 5
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# FIX: each principal component is a linear mixture of *all* PCA input
# columns, not one original variable, so labelling components with the raw
# feature names ('hospital_beds_per_thousand', ...) misattributed the
# downstream XGBoost feature importances. Label them honestly as PC1..PC5.
# NOTE(review): the PCA input (iloc[:, 2:]) still contains 'Mortality Rate'
# and its lags, i.e. the target leaks into these components.
pc_labels = ['PC{}'.format(i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'; note the dummy columns are not used in X below —
# only 'Mortality Rate' is taken from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_labels
X = principal_df[selected_cols].values
# Target: the current-day mortality rate.
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only; it is applied to both splits later.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Search space for the exhaustive grid search.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Grid search with 10-fold cross-validation; fit() returns the fitted search
# object, so construction and fitting are chained.
grid_search = GridSearchCV(
    estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1
).fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9954359249296372
# Refit the tuned estimator on the training split and evaluate it on the
# held-out test split.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)
# Test-set metrics: MSE, R^2, RMSE and relative entropy.
# NOTE(review): scipy.stats.entropy normalizes both arguments into
# probability distributions and returns their KL divergence — an unusual
# regression metric (infinite when y_pred is 0 where y_test is not).
# TODO confirm this is intentional.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004634197824994556 R2 Score: 0.9960985201154687 RMSE: 0.068075 Entropy Value: 0.0016279455279392522
# Pull the fitted model's importance scores and rank the inputs by them.
raw_importances = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': raw_importances})
    .sort_values(by='importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.567476 |
| 2 | gdp_per_capita | 0.207296 |
| 0 | hospital_beds_per_thousand | 0.151705 |
| 3 | population_density | 0.054498 |
| 4 | population | 0.019025 |
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded, user-specific absolute path — consider a relative
# path or a configurable constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Ireland'
country2 = 'Luxembourg'
# Keep only the population-health-index features (plus identifiers and the
# target) and restrict the rows to the two countries under comparison.
index_cols = ['location', 'date',
              'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
              'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
              'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, index_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
2076 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Lags are computed per country so a shift never crosses the boundary
# between the two locations' time series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows fabricates "zero mortality"
# observations at the start of each country's series; dropping those rows
# may be preferable — TODO confirm intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] keeps every column after 'location'/'date' —
# including 'Mortality Rate' (the prediction target) and its lags — so the
# target leaks into the components later used as model inputs, which likely
# explains the near-perfect downstream R^2. PCA is also fit on the full
# dataset before the train/test split (a second leak). Both need a
# coordinated fix together with the matching transform step below.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Reduce the PCA output to 7 components, matching the number of raw input
# variables used for the population-health-index model.
n_components = 7
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# FIX: each principal component is a linear mixture of *all* PCA input
# columns, not one original variable, so labelling components with the raw
# feature names ('cardiovasc_death_rate', ...) misattributed the
# downstream XGBoost feature importances. Label them honestly as PC1..PC7.
# NOTE(review): the PCA input (iloc[:, 2:]) still contains 'Mortality Rate'
# and its lags, i.e. the target leaks into these components.
pc_labels = ['PC{}'.format(i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'; note the dummy columns are not used in X below —
# only 'Mortality Rate' is taken from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_labels
X = principal_df[selected_cols].values
# Target: the current-day mortality rate.
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only; it is applied to both splits later.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits using the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters are selected by the grid search.
xgb_model = xgb.XGBRegressor()
# Candidate values for each tuned hyperparameter.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive search over the grid with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987521060977691
# Retrieve the tuned model. GridSearchCV uses refit=True by default, so
# best_estimator_ has already been refit on the full training set with the best
# hyperparameters — the previous explicit best_model.fit(...) call on the same
# data was redundant and has been removed.
best_model = grid_search.best_estimator_
# Predict mortality rates for the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XGBoost model: MSE, RMSE, R^2, and the entropy value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalises both vectors into
# probability distributions and returns their KL divergence — it is not a standard
# regression metric and requires non-negative inputs; confirm this is the intended
# quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0015514128412379164 R2 Score: 0.9993205997164117 RMSE: 0.039388 Entropy Value: 0.0003261513916954156
# Rank the model inputs by XGBoost importance score, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 6 | median_age | 0.681071 |
| 5 | aged_65_older | 0.213986 |
| 0 | cardiovasc_death_rate | 0.084737 |
| 1 | diabetes_prevalence | 0.013333 |
| 2 | female_smokers | 0.006390 |
| 3 | male_smokers | 0.000304 |
| 4 | life_expectancy | 0.000178 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute local path — resolves only on this machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the country-health-index analysis.
country1 = 'Ireland'
country2 = 'Luxembourg'
# Restrict to the two countries of interest and keep only the socioeconomic /
# infrastructure predictors plus the mortality-rate target.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty',
             'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 4.51 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 4.51 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 4.51 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 4.51 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 4.51 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 2.96 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 2.96 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 2.96 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 2.96 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 2.96 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
2076 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Frame the per-country time series as a supervised problem: add lagged
# mortality-rate columns looking back 1 day, 7 days and 30 days.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # shift() within each country leaves NaN for the first `lag_days` rows
    # (no history available); those gaps are filled with 0.
    df_updated[lag_col] = (
        df_updated.groupby(['location'])['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] spans every column after 'location' and 'date',
# which here includes 'Mortality Rate' itself and the three lagged-mortality columns
# (see the column selection and lag creation in the preceding cells). The prediction
# target therefore leaks into the principal components used as model inputs below,
# which likely inflates the near-perfect scores — confirm this is intended.
# NOTE(review): PCA is fit on all rows before the train/test split, so test-set
# information influences the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # number of principal components kept for the XGBoost Model Analysis
# Keep only the first 5 components (PCA orders them by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the ORIGINAL feature names, but each column is
# a principal component (a linear mix of all inputs), not the named feature — the
# feature-importance table computed later therefore does not measure the named
# variables directly; consider renaming to PC1..PC5.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards — X is built from
# principal_df and only 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Model inputs: the 5 principal components; target: the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# The scaler is fit on training rows only; the test set is transformed with the
# same parameters in the next cell.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits using the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters are selected by the grid search.
xgb_model = xgb.XGBRegressor()
# Candidate values for each tuned hyperparameter.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive search over the grid with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9951141561453272
# Retrieve the tuned model. GridSearchCV uses refit=True by default, so
# best_estimator_ has already been refit on the full training set with the best
# hyperparameters — the previous explicit best_model.fit(...) call on the same
# data was redundant and has been removed.
best_model = grid_search.best_estimator_
# Predict mortality rates for the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XGBoost model: MSE, RMSE, R^2, and the entropy value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalises both vectors into
# probability distributions and returns their KL divergence — it is not a standard
# regression metric and requires non-negative inputs; confirm this is the intended
# quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00468197321349643 R2 Score: 0.9979496534742719 RMSE: 0.068425 Entropy Value: 0.0012251208567780243
# Rank the model inputs by XGBoost importance score, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.741234 |
| 0 | hospital_beds_per_thousand | 0.114263 |
| 2 | gdp_per_capita | 0.070959 |
| 4 | population | 0.047536 |
| 3 | population_density | 0.026008 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute local path — resolves only on this machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the population-health-index analysis.
country1 = 'Netherlands'
country2 = 'Slovenia'
# Restrict to the two countries of interest and keep only the population-health
# predictors plus the mortality-rate target.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
             'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
             'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.536669 |
2099 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Frame the per-country time series as a supervised problem: add lagged
# mortality-rate columns looking back 1 day, 7 days and 30 days.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # shift() within each country leaves NaN for the first `lag_days` rows
    # (no history available); those gaps are filled with 0.
    df_updated[lag_col] = (
        df_updated.groupby(['location'])['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] spans every column after 'location' and 'date',
# which here includes 'Mortality Rate' itself and the three lagged-mortality columns
# (see the column selection and lag creation in the preceding cells). The prediction
# target therefore leaks into the principal components used as model inputs below,
# which likely inflates the near-perfect scores — confirm this is intended.
# NOTE(review): PCA is fit on all rows before the train/test split, so test-set
# information influences the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # number of principal components kept for the XGBoost Model Analysis
# Keep only the first 7 components (PCA orders them by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the ORIGINAL feature names, but each column is
# a principal component (a linear mix of all inputs), not the named feature — the
# feature-importance table computed later therefore does not measure the named
# variables directly; consider renaming to PC1..PC7.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards — X is built from
# principal_df and only 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Model inputs: the 7 principal components; target: the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# The scaler is fit on training rows only; the test set is transformed with the
# same parameters in the next cell.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits using the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters are selected by the grid search.
xgb_model = xgb.XGBRegressor()
# Candidate values for each tuned hyperparameter.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive search over the grid with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9992039584453336
# Retrieve the tuned model. GridSearchCV uses refit=True by default, so
# best_estimator_ has already been refit on the full training set with the best
# hyperparameters — the previous explicit best_model.fit(...) call on the same
# data was redundant and has been removed.
best_model = grid_search.best_estimator_
# Predict mortality rates for the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XGBoost model: MSE, RMSE, R^2, and the entropy value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalises both vectors into
# probability distributions and returns their KL divergence — it is not a standard
# regression metric and requires non-negative inputs; confirm this is the intended
# quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005761184416829001 R2 Score: 0.9992968245245488 RMSE: 0.075902 Entropy Value: 0.0004786331730612395
# Rank the model inputs by XGBoost importance score, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.564395 |
| 6 | median_age | 0.214805 |
| 5 | aged_65_older | 0.195681 |
| 2 | female_smokers | 0.016241 |
| 0 | cardiovasc_death_rate | 0.005649 |
| 3 | male_smokers | 0.002997 |
| 4 | life_expectancy | 0.000231 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute local path — resolves only on this machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the country-health-index analysis.
country1 = 'Netherlands'
country2 = 'Slovenia'
# Restrict to the two countries of interest and keep only the socioeconomic /
# infrastructure predictors plus the mortality-rate target.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty',
             'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.536669 |
2099 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Frame the per-country time series as a supervised problem: add lagged
# mortality-rate columns looking back 1 day, 7 days and 30 days.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # shift() within each country leaves NaN for the first `lag_days` rows
    # (no history available); those gaps are filled with 0.
    df_updated[lag_col] = (
        df_updated.groupby(['location'])['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] spans every column after 'location' and 'date',
# which here includes 'Mortality Rate' itself and the three lagged-mortality columns
# (see the column selection and lag creation in the preceding cells). The prediction
# target therefore leaks into the principal components used as model inputs below,
# which likely inflates the near-perfect scores — confirm this is intended.
# NOTE(review): PCA is fit on all rows before the train/test split, so test-set
# information influences the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # number of principal components kept for the XGBoost Model Analysis
# Keep only the first 5 components (PCA orders them by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the ORIGINAL feature names, but each column is
# a principal component (a linear mix of all inputs), not the named feature — the
# feature-importance table computed later therefore does not measure the named
# variables directly; consider renaming to PC1..PC5.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards — X is built from
# principal_df and only 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Model inputs: the 5 principal components; target: the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# The scaler is fit on training rows only; the test set is transformed with the
# same parameters in the next cell.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits using the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters are selected by the grid search.
xgb_model = xgb.XGBRegressor()
# Candidate values for each tuned hyperparameter.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive search over the grid with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9982367909239199
# Retrieve the tuned model. GridSearchCV uses refit=True by default, so
# best_estimator_ has already been refit on the full training set with the best
# hyperparameters — the previous explicit best_model.fit(...) call on the same
# data was redundant and has been removed.
best_model = grid_search.best_estimator_
# Predict mortality rates for the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XGBoost model: MSE, RMSE, R^2, and the entropy value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalises both vectors into
# probability distributions and returns their KL divergence — it is not a standard
# regression metric and requires non-negative inputs; confirm this is the intended
# quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008784049368404725 R2 Score: 0.9989278718325746 RMSE: 0.093723 Entropy Value: 0.0011132809856388346
# Rank the model inputs by XGBoost importance score, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.751029 |
| 2 | gdp_per_capita | 0.152480 |
| 0 | hospital_beds_per_thousand | 0.088799 |
| 3 | population_density | 0.004278 |
| 4 | population | 0.003415 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute local path — resolves only on this machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair being compared in this run of the pipeline.
country1 = 'Sweden'
country2 = 'Switzerland'
# Extracting important features for XGBoost Model Analysis for the population health index
# Keep only the identifier columns, the seven population-health predictors,
# and the target ('Mortality Rate').
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict rows to the two countries under comparison.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
# Notebook cell echo: preview the filtered dataframe.
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.816005 |
2102 rows × 10 columns
# Rationale (was a free-standing docstring): XGBoost needs a tabular,
# supervised-learning layout, so the Mortality Rate time series is converted
# into lagged predictor columns — previous day, previous week, and previous
# month — using pandas groupby + shift, grouped per country so one country's
# history never bleeds into another's.
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    # shift() leaves NaNs at the head of each country's series where no
    # history exists yet; replace them with 0 as the original code did.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the
# population-health predictors.
#
# FIX 1 (target leakage): the previous code fit PCA on df_updated.iloc[:, 2:],
# which contains 'Mortality Rate' and the three lagged-mortality columns — the
# prediction target itself leaked into the components later used to predict it.
# PCA is now fit only on the seven predictor columns.
# FIX 2 (scale sensitivity): PCA is variance-driven, so unscaled features with
# large numeric ranges dominate the components; standardize the inputs first.
pca_input_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                  'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
pca_scaler = StandardScaler()
pca_features = pca_scaler.fit_transform(df_updated[pca_input_cols])
pca = PCA()
pca.fit(pca_features)
# Keep as many components as predictors (7): a lossless rotation of the inputs.
n_components = 7  # number of input variables for the XGBoost model analysis
principal_components = pca.transform(pca_features)[:, :n_components]
# NOTE(review): these columns are principal components (linear combinations of
# ALL predictors), not the original features. The original names are kept only
# so the downstream selected_cols lookup keeps working; any feature-importance
# table built on them describes components, not the named features.
principal_df = pd.DataFrame(data=principal_components, columns=pca_input_cols)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location dummy columns are never used below in
# this section — only 'Mortality Rate' is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X = PCA component scores (carrying the original feature names); y = raw target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default, so adjacent days of this
# time series land in both train and test — this inflates the reported scores;
# a chronological split would be a more honest evaluation. Verify intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# (correctly fit on the training partition only, then applied to both sets)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations; with cv=10 this trains 3240 models.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): no 'scoring' argument, so GridSearchCV uses the regressor's
# default score (R^2); folds are not time-ordered — confirm that is intended.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9983978600106347
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV(refit=True is the default) already refits
# best_estimator_ on the full training data — this explicit fit is redundant,
# though harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors to sum to 1
# and returns their KL divergence — it is not a standard regression metric, and
# it is undefined (inf) wherever y_pred is 0 but y_test is not. Confirm this
# "Entropy Value" is really what was intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.016940840920362982 R2 Score: 0.99676348032928 RMSE: 0.130157 Entropy Value: 0.0007413159900353029
# Extract per-input importances from the fitted booster.
feature_importances = best_model.feature_importances_
# NOTE(review): the model was trained on PCA component scores, so each row
# here is the importance of a principal component that merely CARRIES an
# original feature's name — not the importance of that raw feature itself.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
# Notebook cell echo: display the ranked importance table.
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.816403 |
| 6 | median_age | 0.105381 |
| 5 | aged_65_older | 0.045172 |
| 0 | cardiovasc_death_rate | 0.018481 |
| 2 | female_smokers | 0.009792 |
| 3 | male_smokers | 0.004547 |
| 4 | life_expectancy | 0.000225 |
# Importing the dataframe of all 26 countries
# Re-load the full dataset: df_updated was narrowed/encoded by the previous run.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Notebook cell echo: preview the loaded dataframe.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Same country pair; this run uses the country-health (infrastructure/economic)
# predictors instead of the population-health ones.
country1 = 'Sweden'
country2 = 'Switzerland'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict rows to the two countries under comparison.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
# Notebook cell echo: preview the filtered dataframe.
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 4.53 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 4.53 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 4.53 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 4.53 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 4.53 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.816005 |
2102 rows × 8 columns
# Rationale (was a free-standing docstring): convert the Mortality Rate time
# series into a supervised-learning layout for XGBoost by adding lagged
# predictor columns (previous day / week / month), computed per country via
# groupby + shift so countries' histories stay separate.
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    # Head-of-series NaNs introduced by shift() are zero-filled, as before.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the
# country-health predictors.
#
# FIX 1 (target leakage): df_updated.iloc[:, 2:] included 'Mortality Rate' and
# the lagged-mortality columns, leaking the target into the components used to
# predict it. Fit PCA only on the five predictor columns.
# FIX 2 (scale sensitivity): standardize first — otherwise 'population'
# (hundreds of millions) dominates the unscaled variance entirely.
pca_input_cols = ['hospital_beds_per_thousand', 'extreme_poverty',
                  'gdp_per_capita', 'population_density', 'population']
pca_scaler = StandardScaler()
pca_features = pca_scaler.fit_transform(df_updated[pca_input_cols])
pca = PCA()
pca.fit(pca_features)
# Keep as many components as predictors (5): a lossless rotation of the inputs.
n_components = 5  # number of input variables for the XGBoost model analysis
principal_components = pca.transform(pca_features)[:, :n_components]
# NOTE(review): columns are principal components, not the named raw features;
# names are retained only for downstream selected_cols compatibility.
principal_df = pd.DataFrame(data=principal_components, columns=pca_input_cols)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are unused below — only 'Mortality Rate' is read.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X = PCA component scores (carrying feature names); y = raw target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): shuffled split of a time series — inflates scores; see earlier note.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set (training partition only — correct).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 combinations; 3240 fits at cv=10).
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2 (no 'scoring' argument supplied).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9980805378155827
# Fit the model using the best hyperparameters
# (redundant — refit=True already refit best_estimator_; harmless)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is a KL divergence of the
# normalized vectors, not a regression metric — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.016667254115702308 R2 Score: 0.9968157486363314 RMSE: 0.129102 Entropy Value: 0.001646613884826754
# Extract per-input importances from the fitted booster.
feature_importances = best_model.feature_importances_
# NOTE(review): the inputs were PCA component scores, so these importances
# describe components labeled with feature names, not the raw features.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
# Notebook cell echo: display the ranked importance table.
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.659676 |
| 2 | gdp_per_capita | 0.190431 |
| 0 | hospital_beds_per_thousand | 0.124054 |
| 3 | population_density | 0.022797 |
| 4 | population | 0.003043 |
# Importing the dataframe of all 26 countries
# Re-load the full dataset for the next country pair (UK vs US).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Notebook cell echo: preview the loaded dataframe.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# New country pair for this pipeline run.
country1 = 'United Kingdom'
country2 = 'United States'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict rows to the two countries under comparison.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
# Notebook cell echo: preview the filtered dataframe. NOTE(review): early-2020
# UK Mortality Rate values of 25–100 (ratios of tiny case counts) are extreme
# outliers relative to the ~1.0 steady-state values — consider trimming them.
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084791 |
2136 rows × 10 columns
# Rationale (was a free-standing docstring): add per-country lagged Mortality
# Rate columns (previous day / week / month) via groupby + shift so the time
# series becomes a supervised-learning table for XGBoost.
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    # Zero-fill the head-of-series NaNs produced by shift(), as before.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the
# population-health predictors (UK/US run).
#
# FIX 1 (target leakage): df_updated.iloc[:, 2:] included 'Mortality Rate' and
# the lagged-mortality columns — the target leaked into the components used to
# predict it. Fit PCA only on the seven predictor columns.
# FIX 2 (scale sensitivity): standardize before PCA so large-scale features do
# not dominate the variance.
pca_input_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                  'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
pca_scaler = StandardScaler()
pca_features = pca_scaler.fit_transform(df_updated[pca_input_cols])
pca = PCA()
pca.fit(pca_features)
# Keep as many components as predictors (7): a lossless rotation of the inputs.
n_components = 7  # number of input variables for the XGBoost model analysis
principal_components = pca.transform(pca_features)[:, :n_components]
# NOTE(review): columns are principal components, not the named raw features;
# names are retained only for downstream selected_cols compatibility.
principal_df = pd.DataFrame(data=principal_components, columns=pca_input_cols)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are unused below — only 'Mortality Rate' is read.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X = PCA component scores (carrying feature names); y = raw target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): shuffled split of a time series — inflates scores; see earlier note.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set (training partition only — correct).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 combinations; 3240 fits at cv=10).
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2 (no 'scoring' argument supplied).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9573756200168303
# Fit the model using the best hyperparameters
# (redundant — refit=True already refit best_estimator_; harmless)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is a KL divergence of the
# normalized vectors, not a regression metric — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 3.067522668688002 R2 Score: 0.8735128665771991 RMSE: 1.751434 Entropy Value: 0.00997828364305316
# Extract per-input importances from the fitted booster.
feature_importances = best_model.feature_importances_
# NOTE(review): importances describe PCA components labeled with feature names,
# not the raw features themselves.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
# Notebook cell echo: display the ranked importance table.
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.515435 |
| 1 | diabetes_prevalence | 0.265632 |
| 2 | female_smokers | 0.128937 |
| 5 | aged_65_older | 0.040833 |
| 6 | median_age | 0.026848 |
| 4 | life_expectancy | 0.013474 |
| 3 | male_smokers | 0.008841 |
# Importing the dataframe of all 26 countries
# Re-load the full dataset for the UK/US country-health run.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Notebook cell echo: preview the loaded dataframe.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Same UK/US pair; this run uses the country-health (infrastructure/economic)
# predictors.
country1 = 'United Kingdom'
country2 = 'United States'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict rows to the two countries under comparison.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
# Notebook cell echo: preview the filtered dataframe.
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 2.54 | 0.2 | 39753.244 | 272.898 | 67508936 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 2.54 | 0.2 | 39753.244 | 272.898 | 67508936 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 2.54 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 2.54 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 2.54 | 0.2 | 39753.244 | 272.898 | 67508936 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2136 rows × 8 columns
# Rationale (was a free-standing docstring): add per-country lagged Mortality
# Rate columns (previous day / week / month) via groupby + shift so the time
# series becomes a supervised-learning table for XGBoost.
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    # Zero-fill the head-of-series NaNs produced by shift(), as before.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the
# country-health predictors (UK/US run).
#
# FIX 1 (target leakage): df_updated.iloc[:, 2:] included 'Mortality Rate' and
# the lagged-mortality columns — the target leaked into the components used to
# predict it. Fit PCA only on the five predictor columns.
# FIX 2 (scale sensitivity): standardize first — otherwise 'population'
# dominates the unscaled variance entirely.
pca_input_cols = ['hospital_beds_per_thousand', 'extreme_poverty',
                  'gdp_per_capita', 'population_density', 'population']
pca_scaler = StandardScaler()
pca_features = pca_scaler.fit_transform(df_updated[pca_input_cols])
pca = PCA()
pca.fit(pca_features)
# Keep as many components as predictors (5): a lossless rotation of the inputs.
n_components = 5  # number of input variables for the XGBoost model analysis
principal_components = pca.transform(pca_features)[:, :n_components]
# NOTE(review): columns are principal components, not the named raw features;
# names are retained only for downstream selected_cols compatibility.
principal_df = pd.DataFrame(data=principal_components, columns=pca_input_cols)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): this is a random (shuffled) split of per-country time series; rows of the
# same country share identical static feature values, so near-duplicate rows land in both
# train and test, which can inflate the reported scores — a chronological or per-country
# split would give a more honest estimate. Verify which claim the scores support.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using training-set statistics, avoiding scaler leakage)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): no `scoring=` is given, so GridSearchCV uses the regressor's default
# (R^2) — confirm that is the intended model-selection metric.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9551791807835526
# Fit the model using the best hyperparameters found by the grid search
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# BUG FIX: scipy.stats.entropy(pk, qk) computes KL divergence between probability
# distributions (each vector is normalised to sum to 1), and it returns inf whenever
# qk == 0 at a position where pk > 0. Mortality rates contain exact zeros (early-pandemic
# days), which is why this cell previously printed "Entropy Value: inf". Clip negative
# predictions and epsilon-smooth both vectors so the diagnostic stays finite.
_eps = 1e-12
_p = np.asarray(y_test, dtype=float).clip(min=0.0) + _eps
_q = np.asarray(y_pred, dtype=float).clip(min=0.0) + _eps
entropy_val = entropy(_p, _q)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 1.4775238366856351 R2 Score: 0.9390753468347937 RMSE: 1.215534 Entropy Value: inf
# Importances of the five PCA component scores the model was trained on
feature_importances = best_model.feature_importances_
# NOTE(review): 'feature' holds original column names, but the model inputs were PCA
# components — each importance belongs to a component, not to the named raw variable.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.360283 |
| 4 | population | 0.281991 |
| 0 | hospital_beds_per_thousand | 0.175041 |
| 2 | gdp_per_capita | 0.135475 |
| 3 | population_density | 0.047210 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; consider a relative
# path or configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the population-health-index analysis
country1 = 'Cyprus'
country2 = 'Czechia'
# Restrict the frame to identifiers, the population-health features, and the target
# for the two selected countries (single .loc does the column subset and row filter).
pop_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), pop_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919575 |
2061 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied within each country so lags never cross country boundaries)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling undefined leading lags with 0 fabricates "zero mortality" values at
# each country's series start — verify this is acceptable for the analysis.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lag columns, and
# PCA is fitted unscaled on all rows (train + test) — target/test leakage; confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
# Project all rows onto the fitted components and keep the first n_components scores
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PCA component scores, not the original variables —
# labelling them with raw feature names misleads the downstream importance table.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot columns never enter X below — this step appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random shuffle of per-country time series — near-duplicate rows (identical
# static features per country) end up in both train and test, which can inflate scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using training-set statistics)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default regressor scoring (R^2) is used — confirm that is intended.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9968265232047949
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence between probability
# distributions (inputs are normalised to sum to 1); on raw mortality rates it is fragile —
# any predicted 0 where the true value is > 0 yields inf. Consider epsilon-smoothing both
# vectors or dropping this diagnostic.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.000978608688705832 R2 Score: 0.9982973886897886 RMSE: 0.031283 Entropy Value: 0.0005033828926016659
# Importances of the seven PCA component scores the model was trained on
feature_importances = best_model.feature_importances_
# NOTE(review): 'feature' holds raw column names, but each importance belongs to a PCA
# component, not to the named variable.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.658933 |
| 0 | cardiovasc_death_rate | 0.184826 |
| 5 | aged_65_older | 0.094490 |
| 6 | median_age | 0.029701 |
| 2 | female_smokers | 0.024293 |
| 3 | male_smokers | 0.006019 |
| 4 | life_expectancy | 0.001737 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the country-health-index analysis
country1 = 'Cyprus'
country2 = 'Czechia'
# Keep identifiers, the country-health (infrastructure/economic) features, and the target
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[country_health_cols]
# Then restrict the rows to the two selected countries
in_selected_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated[in_selected_countries]
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.40 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 3.40 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 3.40 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 3.40 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 3.40 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 6.63 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 6.63 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 6.63 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 6.63 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 6.63 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.919575 |
2061 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied within each country so lags never cross country boundaries)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling undefined leading lags with 0 fabricates "zero mortality" values at
# each country's series start — verify this is acceptable for the analysis.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lag columns, and
# PCA is fitted unscaled on all rows (train + test) — target/test leakage; confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
# Project all rows onto the fitted components and keep the first n_components scores
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PCA component scores, not the original variables —
# labelling them with raw feature names misleads the downstream importance table.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot columns never enter X below — this step appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random shuffle of per-country time series — near-duplicate rows can land in
# both train and test, which can inflate the reported scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using training-set statistics)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default regressor scoring (R^2) is used — confirm that is intended.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9943317261946557
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence between probability
# distributions; on raw mortality rates it returns inf whenever a predicted value is 0
# where the true value is > 0. Consider epsilon-smoothing both vectors.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0027632388218584106 R2 Score: 0.9951924382797649 RMSE: 0.052567 Entropy Value: 0.001055748763209947
# Importances of the five PCA component scores the model was trained on
feature_importances = best_model.feature_importances_
# NOTE(review): 'feature' holds raw column names, but each importance belongs to a PCA
# component, not to the named variable.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.462078 |
| 0 | hospital_beds_per_thousand | 0.393009 |
| 2 | gdp_per_capita | 0.063780 |
| 4 | population | 0.049975 |
| 3 | population_density | 0.031158 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the population-health-index analysis
country1 = 'Estonia'
country2 = 'France'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6250 | Estonia | 1/18/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6251 | Estonia | 2/5/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6252 | Estonia | 2/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6253 | Estonia | 2/7/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411710 |
| 9443 | France | 12/26/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411282 |
| 9444 | France | 12/27/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411730 |
| 9445 | France | 12/28/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411813 |
| 9446 | France | 12/29/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411892 |
2132 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied within each country so lags never cross country boundaries)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling undefined leading lags with 0 fabricates "zero mortality" values at
# each country's series start — verify this is acceptable for the analysis.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lag columns, and
# PCA is fitted unscaled on all rows (train + test) — target/test leakage; confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
# Project all rows onto the fitted components and keep the first n_components scores
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PCA component scores, not the original variables —
# labelling them with raw feature names misleads the downstream importance table.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot columns never enter X below — this step appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random shuffle of per-country time series — near-duplicate rows can land in
# both train and test, which can inflate the reported scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using training-set statistics)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 candidate combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default regressor scoring (R^2) is used — confirm that is intended.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9969900485757501
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence between probability
# distributions; on raw mortality rates it returns inf whenever a predicted value is 0
# where the true value is > 0. Consider epsilon-smoothing both vectors.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.06340265752044813 R2 Score: 0.9933988909094517 RMSE: 0.251799 Entropy Value: 0.0037487769082763605
# Importances of the seven PCA component scores the model was trained on
feature_importances = best_model.feature_importances_
# NOTE(review): 'feature' holds raw column names, but each importance belongs to a PCA
# component, not to the named variable.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.718145 |
| 0 | cardiovasc_death_rate | 0.215909 |
| 5 | aged_65_older | 0.026000 |
| 6 | median_age | 0.015482 |
| 2 | female_smokers | 0.013423 |
| 3 | male_smokers | 0.010069 |
| 4 | life_expectancy | 0.000971 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Two countries compared in this run of the country health index analysis.
country1 = 'Estonia'
country2 = 'France'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes an independent frame so the lagged-column assignments that
# follow do not trigger pandas' SettingWithCopyWarning on a view / silently
# write to a temporary copy.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 4.69 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6250 | Estonia | 1/18/2020 | 4.69 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6251 | Estonia | 2/5/2020 | 4.69 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6252 | Estonia | 2/6/2020 | 4.69 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6253 | Estonia | 2/7/2020 | 4.69 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 5.98 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411710 |
| 9443 | France | 12/26/2022 | 5.98 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411282 |
| 9444 | France | 12/27/2022 | 5.98 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411730 |
| 9445 | France | 12/28/2022 | 5.98 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411813 |
| 9446 | France | 12/29/2022 | 5.98 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411892 |
2132 rows × 8 columns
# Convert the timeseries into a supervised-learning table: XGBoost needs one
# row per observation with all predictors as columns, so we derive lagged
# mortality features with pandas' shift(). Lags of 1 day, 7 days, and 30 days
# are computed per country so values never leak across location boundaries.
lag_spec = [
    ('prev_day_mortality', 1),
    ('prev_week_mortality', 7),
    ('prev_month_mortality', 30),
]
for lag_col, periods in lag_spec:
    # shift() leaves NaN at the head of each group; treat those as 0 mortality.
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(periods).fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on ALL rows (before the train/test split) and on
# unscaled features — this leaks test-set information into the transform and
# lets large-magnitude columns (e.g. population) dominate the components.
# Consider fitting PCA on scaled training data only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
# Project all rows and keep only the first n_components columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — reusing the original names here is misleading downstream.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used as features —
# X is built solely from principal_df below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split shuffles a timeseries, so adjacent days land in
# both train and test, likely inflating scores — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training split only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base regressor; every tunable setting is supplied through the grid below.
base_model = xgb.XGBRegressor()

# Candidate hyperparameter values for the exhaustive search.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}

# Exhaustive grid search with 10-fold cross-validation (k = 10), parallelized
# across all available CPU cores.
grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9952478754571652
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV(refit=True, the default) has already refit
# best_estimator_ on the full training set, so this fit is redundant
# (harmless, but duplicated work).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs and computes the KL
# divergence D(y_test || y_pred); it assumes non-negative values and is not a
# standard regression metric — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.054477133587009265 R2 Score: 0.9943281635847481 RMSE: 0.233403 Entropy Value: 0.0035443412739072674
# Gain-based feature importances from the tuned XGBoost model.
feature_importances = best_model.feature_importances_
# NOTE(review): X was built from PCA components, so these importances belong to
# principal components PC1..PCk, not to the original columns — labelling them
# with selected_cols is misleading; confirm this is intended.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.685218 |
| 0 | hospital_beds_per_thousand | 0.153925 |
| 2 | gdp_per_capita | 0.080001 |
| 4 | population | 0.063162 |
| 3 | population_density | 0.017695 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute Windows path — runs only on this machine; consider a
# relative path or configuration variable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Two countries compared in this run of the population health index analysis.
country1 = 'Italy'
country2 = 'Latvia'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes an independent frame so the lagged-column assignments that
# follow do not trigger pandas' SettingWithCopyWarning on a view / silently
# write to a temporary copy.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 19873 | Latvia | 1/6/2020 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.000000 |
| 19874 | Latvia | 1/18/2020 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.000000 |
| 19875 | Latvia | 2/12/2020 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.000000 |
| 19876 | Latvia | 2/29/2020 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.000000 |
| 19877 | Latvia | 3/1/2020 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
2102 rows × 10 columns
# Convert the timeseries into a supervised-learning table: XGBoost needs one
# row per observation with all predictors as columns, so we derive lagged
# mortality features with pandas' shift(). Lags of 1 day, 7 days, and 30 days
# are computed per country so values never leak across location boundaries.
lag_spec = [
    ('prev_day_mortality', 1),
    ('prev_week_mortality', 7),
    ('prev_month_mortality', 30),
]
for lag_col, periods in lag_spec:
    # shift() leaves NaN at the head of each group; treat those as 0 mortality.
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(periods).fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on ALL rows (before the train/test split) and on
# unscaled features — this leaks test-set information into the transform and
# lets large-magnitude columns dominate the components. Consider fitting PCA
# on scaled training data only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
# Project all rows and keep only the first n_components columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — reusing the original names here is misleading downstream.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used as features —
# X is built solely from principal_df below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split shuffles a timeseries, so adjacent days land in
# both train and test, likely inflating scores — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training split only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base regressor; every tunable setting is supplied through the grid below.
base_model = xgb.XGBRegressor()

# Candidate hyperparameter values for the exhaustive search.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}

# Exhaustive grid search with 10-fold cross-validation (k = 10), parallelized
# across all available CPU cores.
grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9990157458739676
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV(refit=True, the default) has already refit
# best_estimator_ on the full training set, so this fit is redundant
# (harmless, but duplicated work).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs and computes the KL
# divergence D(y_test || y_pred); it assumes non-negative values and is not a
# standard regression metric — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.016257964094386432 R2 Score: 0.9985806056081434 RMSE: 0.127507 Entropy Value: 0.0005473259353336738
# Gain-based feature importances from the tuned XGBoost model.
feature_importances = best_model.feature_importances_
# NOTE(review): X was built from PCA components, so these importances belong to
# principal components PC1..PCk, not to the original columns — labelling them
# with selected_cols is misleading; confirm this is intended.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.338637 |
| 5 | aged_65_older | 0.301029 |
| 1 | diabetes_prevalence | 0.300169 |
| 3 | male_smokers | 0.022931 |
| 6 | median_age | 0.018653 |
| 2 | female_smokers | 0.018266 |
| 4 | life_expectancy | 0.000316 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute Windows path — runs only on this machine; consider a
# relative path or configuration variable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Two countries compared in this run of the country health index analysis.
country1 = 'Italy'
country2 = 'Latvia'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes an independent frame so the lagged-column assignments that
# follow do not trigger pandas' SettingWithCopyWarning on a view / silently
# write to a temporary copy.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 19873 | Latvia | 1/6/2020 | 5.57 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.000000 |
| 19874 | Latvia | 1/18/2020 | 5.57 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.000000 |
| 19875 | Latvia | 2/12/2020 | 5.57 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.000000 |
| 19876 | Latvia | 2/29/2020 | 5.57 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.000000 |
| 19877 | Latvia | 3/1/2020 | 5.57 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.18 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.18 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.18 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.18 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.18 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
2102 rows × 8 columns
# Convert the timeseries into a supervised-learning table: XGBoost needs one
# row per observation with all predictors as columns, so we derive lagged
# mortality features with pandas' shift(). Lags of 1 day, 7 days, and 30 days
# are computed per country so values never leak across location boundaries.
lag_spec = [
    ('prev_day_mortality', 1),
    ('prev_week_mortality', 7),
    ('prev_month_mortality', 30),
]
for lag_col, periods in lag_spec:
    # shift() leaves NaN at the head of each group; treat those as 0 mortality.
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(periods).fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on ALL rows (before the train/test split) and on
# unscaled features — this leaks test-set information into the transform and
# lets large-magnitude columns (e.g. population) dominate the components.
# Consider fitting PCA on scaled training data only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
# Project all rows and keep only the first n_components columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — reusing the original names here is misleading downstream.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used as features —
# X is built solely from principal_df below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split shuffles a timeseries, so adjacent days land in
# both train and test, likely inflating scores — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training split only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base regressor; every tunable setting is supplied through the grid below.
base_model = xgb.XGBRegressor()

# Candidate hyperparameter values for the exhaustive search.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}

# Exhaustive grid search with 10-fold cross-validation (k = 10), parallelized
# across all available CPU cores.
grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979179222463455
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV(refit=True, the default) has already refit
# best_estimator_ on the full training set, so this fit is redundant
# (harmless, but duplicated work).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs and computes the KL
# divergence D(y_test || y_pred); it assumes non-negative values and is not a
# standard regression metric — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.027210653455331336 R2 Score: 0.9976243858893387 RMSE: 0.164957 Entropy Value: 0.0013566776347740147
# Gain-based feature importances from the tuned XGBoost model.
feature_importances = best_model.feature_importances_
# NOTE(review): X was built from PCA components, so these importances belong to
# principal components PC1..PCk, not to the original columns — labelling them
# with selected_cols is misleading; confirm this is intended.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.481945 |
| 0 | hospital_beds_per_thousand | 0.321784 |
| 3 | population_density | 0.105203 |
| 2 | gdp_per_capita | 0.083330 |
| 4 | population | 0.007738 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute Windows path — runs only on this machine; consider a
# relative path or configuration variable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Two countries compared in this run of the population health index analysis.
country1 = 'Portugal'
country2 = 'Slovakia'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes an independent frame so the lagged-column assignments that
# follow do not trigger pandas' SettingWithCopyWarning on a view / silently
# write to a temporary copy.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 12542 | Slovakia | 12/25/2022 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.783216 |
| 12543 | Slovakia | 12/26/2022 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.783313 |
| 12544 | Slovakia | 12/27/2022 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.783363 |
| 12545 | Slovakia | 12/28/2022 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.783459 |
| 12546 | Slovakia | 12/29/2022 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.783522 |
2063 rows × 10 columns
# Convert the timeseries into a supervised-learning table: XGBoost needs one
# row per observation with all predictors as columns, so we derive lagged
# mortality features with pandas' shift(). Lags of 1 day, 7 days, and 30 days
# are computed per country so values never leak across location boundaries.
lag_spec = [
    ('prev_day_mortality', 1),
    ('prev_week_mortality', 7),
    ('prev_month_mortality', 30),
]
for lag_col, periods in lag_spec:
    # shift() leaves NaN at the head of each group; treat those as 0 mortality.
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(periods).fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on ALL rows (before the train/test split) and on
# unscaled features — this leaks test-set information into the transform and
# lets large-magnitude columns dominate the components. Consider fitting PCA
# on scaled training data only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
# Project all rows and keep only the first n_components columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — reusing the original names here is misleading downstream.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X is built from
# principal_df, and only 'Mortality Rate' is read from df_updated after this.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Feature matrix: the first 7 principal-component scores (column names are
# original feature names, but the values are PC scores — see the PCA step).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target: daily per-country COVID-19 mortality rate.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# (70/30 random split; random_state fixed for reproducibility).
# NOTE(review): a random split of daily time-series rows puts temporally
# adjacent observations of the same country in both train and test; a
# time-based split would give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9986452711349703
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits
# best_estimator_ on the full training set; this extra fit is redundant
# but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays to
# probability distributions and computes KL divergence; it is not a standard
# regression metric, and a zero in y_pred where y_test is nonzero yields inf.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0011302969685894953 R2 Score: 0.9984342668984687 RMSE: 0.033620 Entropy Value: 0.00025289287267278896
# NOTE(review): these importances belong to principal components, not to the
# original features whose names label them; interpret with care.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.666466 |
| 0 | cardiovasc_death_rate | 0.227467 |
| 6 | median_age | 0.077055 |
| 5 | aged_65_older | 0.014932 |
| 2 | female_smokers | 0.011849 |
| 3 | male_smokers | 0.001740 |
| 4 | life_expectancy | 0.000491 |
# Importing the dataframe of all 26 countries
# Reload a fresh copy: the previous section filtered df_updated down to two
# countries and one-hot encoded 'location'.
# NOTE(review): hard-coded absolute Windows path — not portable; consider a
# relative path or configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the country-health-index model.
country1 = 'Portugal'
country2 = 'Slovakia'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.39 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.39 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.39 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.39 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.39 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 12542 | Slovakia | 12/25/2022 | 5.82 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.783216 |
| 12543 | Slovakia | 12/26/2022 | 5.82 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.783313 |
| 12544 | Slovakia | 12/27/2022 | 5.82 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.783363 |
| 12545 | Slovakia | 12/28/2022 | 5.82 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.783459 |
| 12546 | Slovakia | 12/29/2022 | 5.82 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.783522 |
2063 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the per-country mortality series by 1, 7, and 30 days so the
# time series can be framed as a supervised-learning problem.
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
for lag_col, lag_days in lag_spec.items():
    shifted = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
    # The first `lag_days` rows of each country have no history; use 0.
    df_updated[lag_col] = shifted.fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated columns are [location, date, 5 features,
# 'Mortality Rate', 3 lag columns], so iloc[:, 2:] feeds 'Mortality Rate' and
# its lags into the PCA fit — the target leaks into the components, which
# likely explains the near-perfect downstream R².
# NOTE(review): PCA is fitted on unscaled data (population and gdp_per_capita
# dominate the variance); the conventional order is scale first, then PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
# Keep only the first 5 component scores for every row.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names reuse the original feature names, but each
# column is a principal component (a linear combination of all inputs), not
# the named feature — the labels are misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Carry the country label alongside the component scores.
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X is built from
# principal_df, and only 'Mortality Rate' is read from df_updated after this.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Feature matrix: the first 5 principal-component scores (column names are
# original feature names, but the values are PC scores — see the PCA step).
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Target: daily per-country COVID-19 mortality rate.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# (70/30 random split; random_state fixed for reproducibility).
# NOTE(review): a random split of daily time-series rows leaks temporally
# adjacent observations across train/test; a time-based split is more honest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9977287916661088
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits
# best_estimator_; this extra fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays and
# computes KL divergence; it is not a standard regression metric, and a zero
# in y_pred where y_test is nonzero yields inf.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.001578803228233158 R2 Score: 0.997812977877545 RMSE: 0.039734 Entropy Value: 0.00035458005898024133
# NOTE(review): these importances belong to principal components, not to the
# original features whose names label them; interpret with care.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.790723 |
| 0 | hospital_beds_per_thousand | 0.136952 |
| 2 | gdp_per_capita | 0.044642 |
| 3 | population_density | 0.026271 |
| 4 | population | 0.001412 |
# Importing the dataframe of all 26 countries
# Reload a fresh copy: the previous section filtered df_updated down to two
# countries and one-hot encoded 'location'.
# NOTE(review): hard-coded absolute Windows path — not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the population-health-index model.
country1 = 'Spain'
country2 = 'Bulgaria'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
2090 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the per-country mortality series by 1, 7, and 30 days so the
# time series can be framed as a supervised-learning problem.
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
for lag_col, lag_days in lag_spec.items():
    shifted = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
    # The first `lag_days` rows of each country have no history; use 0.
    df_updated[lag_col] = shifted.fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated columns are [location, date, 7 features,
# 'Mortality Rate', 3 lag columns], so iloc[:, 2:] feeds 'Mortality Rate' and
# its lags into the PCA fit — the target leaks into the components, which
# likely explains the near-perfect downstream R².
# NOTE(review): PCA is fitted on unscaled data; the conventional order is
# scale first, then PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
# Keep only the first 7 component scores for every row.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names reuse the original feature names, but each
# column is a principal component (a linear combination of all inputs), not
# the named feature — the labels are misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Carry the country label alongside the component scores.
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X is built from
# principal_df, and only 'Mortality Rate' is read from df_updated after this.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Feature matrix: the first 7 principal-component scores (column names are
# original feature names, but the values are PC scores — see the PCA step).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target: daily per-country COVID-19 mortality rate.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# (70/30 random split; random_state fixed for reproducibility).
# NOTE(review): a random split of daily time-series rows leaks temporally
# adjacent observations across train/test; a time-based split is more honest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9854266642816312
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits
# best_estimator_; this extra fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays and
# computes KL divergence; it is not a standard regression metric, and a zero
# in y_pred where y_test is nonzero yields inf.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005444360169759133 R2 Score: 0.9988853299880451 RMSE: 0.073786 Entropy Value: 0.00033004558394914924
# NOTE(review): these importances belong to principal components, not to the
# original features whose names label them; interpret with care.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.606117 |
| 5 | aged_65_older | 0.265091 |
| 0 | cardiovasc_death_rate | 0.074869 |
| 2 | female_smokers | 0.023148 |
| 6 | median_age | 0.022173 |
| 4 | life_expectancy | 0.004642 |
| 3 | male_smokers | 0.003961 |
# Importing the dataframe of all 26 countries
# Reload a fresh copy: the previous section filtered df_updated down to two
# countries and one-hot encoded 'location'.
# NOTE(review): hard-coded absolute Windows path — not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the country-health-index model.
country1 = 'Spain'
country2 = 'Bulgaria'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 2.970 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 2.970 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 2.970 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 2.970 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 2.970 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
2090 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the per-country mortality series by 1, 7, and 30 days so the
# time series can be framed as a supervised-learning problem.
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
for lag_col, lag_days in lag_spec.items():
    shifted = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
    # The first `lag_days` rows of each country have no history; use 0.
    df_updated[lag_col] = shifted.fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated columns are [location, date, 5 features,
# 'Mortality Rate', 3 lag columns], so iloc[:, 2:] feeds 'Mortality Rate' and
# its lags into the PCA fit — the target leaks into the components, which
# likely explains the near-perfect downstream R².
# NOTE(review): PCA is fitted on unscaled data (population and gdp_per_capita
# dominate the variance); the conventional order is scale first, then PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
# Keep only the first 5 component scores for every row.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names reuse the original feature names, but each
# column is a principal component (a linear combination of all inputs), not
# the named feature — the labels are misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Carry the country label alongside the component scores.
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X is built from
# principal_df, and only 'Mortality Rate' is read from df_updated after this.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Feature matrix: the first 5 principal-component scores (column names are
# original feature names, but the values are PC scores — see the PCA step).
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Target: daily per-country COVID-19 mortality rate.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# (70/30 random split; random_state fixed for reproducibility).
# NOTE(review): a random split of daily time-series rows leaks temporally
# adjacent observations across train/test; a time-based split is more honest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9839372367216079
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits
# best_estimator_; this extra fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays and
# computes KL divergence; it is not a standard regression metric, and a zero
# in y_pred where y_test is nonzero yields inf.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.012444501810780155 R2 Score: 0.997452131646388 RMSE: 0.111555 Entropy Value: 0.0007493391098116566
# NOTE(review): these importances belong to principal components, not to the
# original features whose names label them; interpret with care.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.691825 |
| 0 | hospital_beds_per_thousand | 0.200318 |
| 2 | gdp_per_capita | 0.056118 |
| 4 | population | 0.035448 |
| 3 | population_density | 0.016291 |
# Importing the dataframe of all 26 countries
# Reload a fresh copy: the previous section filtered df_updated down to two
# countries and one-hot encoded 'location'.
# NOTE(review): hard-coded absolute Windows path — not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Restrict the data to the Romania–Serbia pair, keeping only the
# population-health features used by the XGBoost analysis.
country1 = 'Romania'
country2 = 'Serbia'
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
2076 rows × 10 columns
'''
Create lagged variables for the previous day's, previous week's, and previous month's mortality rates using the pandas shift() method.
These lags recast the Our World in Data COVID-19 time series as a supervised learning problem: the data become tabular, with each row a single observation and each column a feature.
In that form the XGBoost model can be applied directly to assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive lagged mortality-rate features (previous day / week / month) per
# country, then zero-fill the leading rows that have no history yet.
lag_spec = [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]
for lag_col, lag_days in lag_spec:
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
    df_updated[lag_col] = shifted.fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' AND its lagged copies, so
# the prediction target leaks into the components that are later used to
# predict it — this likely explains the near-perfect R^2; confirm intent.
# NOTE(review): PCA is fit on unscaled columns, so variables with the largest
# numeric ranges dominate the components; scaling before PCA is conventional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original variable names, but each
# column is a principal component (a mixture of all inputs, including the
# target and its lags) — the naming is misleading for later interpretation.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used — X is taken from
# principal_df below — so this call effectively only removes the 'location'
# column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series lets future rows train a model
# evaluated on past rows; a chronological split would be safer here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale both splits with the statistics learned from the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values for the grid search.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_}")
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9986912975875828
# Take the winning estimator and score it on the held-out test split.
# NOTE(review): GridSearchCV refits best_estimator_ on the full training data
# by default (refit=True), so this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the test split.
y_pred = best_model.predict(X_test_scaled)
# Test-set error metrics: MSE, RMSE, R^2 and entropy.
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence of the two
# inputs after normalising each to sum to 1 — applying it to raw mortality
# values (which include zeros) has questionable meaning; confirm intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print(f"MSE:  {mse}")
print(f"R2 Score: {score}")
print(f"RMSE: {rmse:f}")
print(f"Entropy Value: {entropy_val}")
MSE: 0.0015132058627513301 R2 Score: 0.9991189988145764 RMSE: 0.038900 Entropy Value: 0.00034236383209695746
# Rank the model's inputs by importance.
# NOTE(review): the model was trained on principal components (the output of
# pca.transform above), not on the raw variables, so labelling each importance
# with an original column name was misleading — component i is a mixture of
# all inputs, not the i-th variable. Label the rows PC1..PCn instead.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': [f'PC{i + 1}' for i in range(len(selected_cols))],
                                    'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.494640 |
| 5 | aged_65_older | 0.227501 |
| 6 | median_age | 0.139444 |
| 1 | diabetes_prevalence | 0.132259 |
| 2 | female_smokers | 0.005578 |
| 3 | male_smokers | 0.000382 |
| 4 | life_expectancy | 0.000195 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — the notebook only runs on
# this machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Restrict the data to the Romania–Serbia pair, keeping only the
# country-health features used by the XGBoost analysis.
country1 = 'Romania'
country2 = 'Serbia'
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 5.609 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 5.609 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 5.609 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 5.609 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 5.609 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 6.892 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 6.892 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 6.892 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 6.892 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 6.892 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
2076 rows × 8 columns
'''
Create lagged variables for the previous day's, previous week's, and previous month's mortality rates using the pandas shift() method.
These lags recast the Our World in Data COVID-19 time series as a supervised learning problem: the data become tabular, with each row a single observation and each column a feature.
In that form the XGBoost model can be applied directly to assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive lagged mortality-rate features (previous day / week / month) per
# country, then zero-fill the leading rows that have no history yet.
lag_spec = [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]
for lag_col, lag_days in lag_spec:
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
    df_updated[lag_col] = shifted.fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' AND its lagged copies, so
# the prediction target leaks into the components that are later used to
# predict it — this likely explains the near-perfect R^2; confirm intent.
# NOTE(review): PCA is fit on unscaled columns, so large-range variables such
# as 'population' dominate the components; scaling before PCA is conventional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original variable names, but each
# column is a principal component (a mixture of all inputs, including the
# target and its lags) — the naming is misleading for later interpretation.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used — X is taken from
# principal_df below — so this call effectively only removes the 'location'
# column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series lets future rows train a model
# evaluated on past rows; a chronological split would be safer here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale both splits with the statistics learned from the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values for the grid search.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_}")
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9969017546192163
# Take the winning estimator and score it on the held-out test split.
# NOTE(review): GridSearchCV refits best_estimator_ on the full training data
# by default (refit=True), so this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the test split.
y_pred = best_model.predict(X_test_scaled)
# Test-set error metrics: MSE, RMSE, R^2 and entropy.
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence of the two
# inputs after normalising each to sum to 1 — applying it to raw mortality
# values (which include zeros) has questionable meaning; confirm intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print(f"MSE:  {mse}")
print(f"R2 Score: {score}")
print(f"RMSE: {rmse:f}")
print(f"Entropy Value: {entropy_val}")
MSE: 0.0023159124982087367 R2 Score: 0.9986516562574312 RMSE: 0.048124 Entropy Value: 0.0005495487035360241
# Rank the model's inputs by importance.
# NOTE(review): the model was trained on principal components (the output of
# pca.transform above), not on the raw variables, so labelling each importance
# with an original column name was misleading — component i is a mixture of
# all inputs, not the i-th variable. Label the rows PC1..PCn instead.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': [f'PC{i + 1}' for i in range(len(selected_cols))],
                                    'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | hospital_beds_per_thousand | 0.703587 |
| 1 | extreme_poverty | 0.249542 |
| 2 | gdp_per_capita | 0.030094 |
| 3 | population_density | 0.015271 |
| 4 | population | 0.001506 |
# Country Pair by Pair Analysis relative to extreme poverty
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): hard-coded absolute Windows path — the notebook only runs on
# this machine; consider a relative path or a configurable data directory.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Showing the pairings of countries based on extreme poverty (13 pairs of countries)
# NOTE(review): this per-country slicing is highly repetitive — a dict
# comprehension over a country list would replace ~26 near-identical lines,
# but the individual names are kept because the concatenation step below
# refers to each of them directly.
df_Cyprus = df[(df.location == "Cyprus")]
df_Czechia = df[(df.location == "Czechia")]
df_Finland = df[(df.location == "Finland")]
df_France = df[(df.location == "France")]
df_Netherlands = df[(df.location == "Netherlands")]
df_Serbia = df[(df.location == "Serbia")]
df_Slovenia = df[(df.location == "Slovenia")]
df_Switzerland = df[(df.location == "Switzerland")]
df_Austria = df[(df.location == "Austria")]
df_Belgium = df[(df.location == "Belgium")]
df_Canada = df[(df.location == "Canada")]
df_Denmark = df[(df.location == "Denmark")]
df_Estonia = df[(df.location == "Estonia")]
df_Iceland = df[(df.location == "Iceland")]
df_Ireland = df[(df.location == "Ireland")]
df_Latvia = df[(df.location == "Latvia")]
df_Luxembourg = df[(df.location == "Luxembourg")]
df_Portugal = df[(df.location == "Portugal")]
df_Slovakia = df[(df.location == "Slovakia")]
df_Sweden = df[(df.location == "Sweden")]
df_UnitedKingdom = df[(df.location == "United Kingdom")]
df_Bulgaria = df[(df.location == "Bulgaria")]
df_Italy = df[(df.location == "Italy")]
df_Romania = df[(df.location == "Romania")]
df_Spain = df[(df.location == "Spain")]
df_UnitedStates = df[(df.location == "United States")]
# tail(-2) keeps all rows EXCEPT the first two — presumably to align the UK's
# start date with the other countries; TODO confirm the intent.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)
# Exporting final_dataframe to CSV file
# NOTE(review): this writes to the current working directory, while later cells
# read the file from C:/Users/marco/Downloads — the two only match when the
# notebook runs with Downloads as its working directory; confirm.
# NOTE(review): to_csv writes the index by default (index=True), which comes
# back as an extra unnamed column on re-read; pass index=False to round-trip.
dataframe_one.to_csv("dataframe-one.csv")
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — see the export note above on
# the working-directory/Downloads mismatch.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Restrict the data to the Cyprus–Czechia pair, keeping only the
# population-health features used by the XGBoost analysis.
country1 = 'Cyprus'
country2 = 'Czechia'
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919575 |
2061 rows × 10 columns
'''
Create lagged variables for the previous day's, previous week's, and previous month's mortality rates using the pandas shift() method.
These lags recast the Our World in Data COVID-19 time series as a supervised learning problem: the data become tabular, with each row a single observation and each column a feature.
In that form the XGBoost model can be applied directly to assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive lagged mortality-rate features (previous day / week / month) per
# country, then zero-fill the leading rows that have no history yet.
lag_spec = [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]
for lag_col, lag_days in lag_spec:
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
    df_updated[lag_col] = shifted.fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' AND its lagged copies, so
# the prediction target leaks into the components that are later used to
# predict it — this likely explains the near-perfect R^2; confirm intent.
# NOTE(review): PCA is also fit on unscaled columns, so variables with the
# largest numeric ranges dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original variable names, but each
# column is a principal component (a mixture of all inputs, including the
# target and its lags) — the naming is misleading for later interpretation.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used — X is taken from
# principal_df below — so this call effectively only removes the 'location'
# column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series lets future rows train a model
# evaluated on past rows; a chronological split would be safer here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale both splits with the statistics learned from the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values for the grid search.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_}")
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9968265232047949
# Take the winning estimator and score it on the held-out test split.
# NOTE(review): GridSearchCV refits best_estimator_ on the full training data
# by default (refit=True), so this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Predict on the test split.
y_pred = best_model.predict(X_test_scaled)
# Test-set error metrics: MSE, RMSE, R^2 and entropy.
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence of the two
# inputs after normalising each to sum to 1 — applying it to raw mortality
# values (which include zeros) has questionable meaning; confirm intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print(f"MSE:  {mse}")
print(f"R2 Score: {score}")
print(f"RMSE: {rmse:f}")
print(f"Entropy Value: {entropy_val}")
MSE: 0.000978608688705832 R2 Score: 0.9982973886897886 RMSE: 0.031283 Entropy Value: 0.0005033828926016659
# Rank the model's inputs by importance.
# NOTE(review): the model was trained on principal components (the output of
# pca.transform above), not on the raw variables, so labelling each importance
# with an original column name was misleading — component i is a mixture of
# all inputs, not the i-th variable. Label the rows PC1..PCn instead.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': [f'PC{i + 1}' for i in range(len(selected_cols))],
                                    'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.658933 |
| 0 | cardiovasc_death_rate | 0.184826 |
| 5 | aged_65_older | 0.094490 |
| 6 | median_age | 0.029701 |
| 2 | female_smokers | 0.024293 |
| 3 | male_smokers | 0.006019 |
| 4 | life_expectancy | 0.001737 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — the notebook only runs on
# this machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Cyprus'
country2 = 'Czechia'
# Extracting important features for XGBoost Model Analysis for the country health index
cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
        'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
# Select rows for the two countries and the index columns in one step.
# .copy() makes an independent frame: the lagged-mortality columns assigned
# later would otherwise hit a filtered *view* and raise pandas'
# SettingWithCopyWarning (and can silently fail to write through).
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.40 | 0.887 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 3.40 | 0.887 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 3.40 | 0.887 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 3.40 | 0.887 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 3.40 | 0.887 | 32415.132 | 127.657 | 896007 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 6.63 | 0.900 | 32605.906 | 137.176 | 10493990 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 6.63 | 0.900 | 32605.906 | 137.176 | 10493990 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 6.63 | 0.900 | 32605.906 | 137.176 | 10493990 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 6.63 | 0.900 | 32605.906 | 137.176 | 10493990 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 6.63 | 0.900 | 32605.906 | 137.176 | 10493990 | 0.919575 |
2061 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the per-country mortality series by 1 day, 1 week, and 1 month;
# the leading rows with no history (NaN after shift) are zero-filled.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target)
# and its lagged copies in the PCA input, so target information leaks into the
# components used as model features — this likely inflates the downstream R^2;
# confirm whether that is intended.
# NOTE(review): PCA is fitted on unscaled data here, so large-magnitude columns
# (e.g. population) dominate the components; scaling usually precedes PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
# Project onto the PCA basis fitted above and keep the first n_components scores
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal-component scores, not the original
# variables — labelling them with the original feature names is misleading when
# the XGBoost feature importances are read later; consider PC1..PC5 instead.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used — X below is built
# from principal_df only, so this one-hot encoding has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Target: the same-day mortality rate (also present in the PCA input above)
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): PCA was fitted on the full dataset before this split, so test
# rows influenced the components (mild train/test leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the scaler fitted on the training set only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor whose hyperparameters are tuned by the grid search below
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values to search over
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9943317261946557
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# (refit=True is GridSearchCV's default, so best_model is already fitted; this
# explicit refit on the scaled training set is redundant but harmless)
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy treats its two arguments as (unnormalised) probability
# distributions and returns their KL divergence: a zero in y_pred where y_test
# is positive makes the result infinite, and negative predictions are invalid.
# Clip both vectors to a tiny positive floor so the reported value stays finite.
eps = 1e-12
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0027632388218584106 R2 Score: 0.9951924382797649 RMSE: 0.052567 Entropy Value: 0.001055748763209947
# Rank the model inputs by their XGBoost importance scores, highest first
feature_importances = (
    pd.DataFrame({"feature": selected_cols,
                  "importance": best_model.feature_importances_})
    .sort_values(by="importance", ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.462078 |
| 0 | hospital_beds_per_thousand | 0.393009 |
| 2 | gdp_per_capita | 0.063780 |
| 4 | population | 0.049975 |
| 3 | population_density | 0.031158 |
# Load the combined dataframe covering all 26 countries
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Finland'
country2 = 'France'
# Extracting important features for XGBoost Model Analysis for the population health index
cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
        'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
# Select rows for the two countries and the index columns in one step.
# .copy() makes an independent frame: the lagged-mortality columns assigned
# later would otherwise hit a filtered *view* and raise pandas'
# SettingWithCopyWarning (and can silently fail to write through).
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| 7311 | Finland | 1/30/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| 7312 | Finland | 1/31/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| 7313 | Finland | 2/1/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| 7314 | Finland | 2/2/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411710 |
| 9443 | France | 12/26/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411282 |
| 9444 | France | 12/27/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411730 |
| 9445 | France | 12/28/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411813 |
| 9446 | France | 12/29/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411892 |
2137 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the per-country mortality series by 1 day, 1 week, and 1 month;
# the leading rows with no history (NaN after shift) are zero-filled.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target)
# and its lagged copies in the PCA input, so target information leaks into the
# components used as model features — this likely inflates the downstream R^2;
# confirm whether that is intended.
# NOTE(review): PCA is fitted on unscaled data here, so large-magnitude columns
# dominate the components; scaling usually precedes PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
# Project onto the PCA basis fitted above and keep the first n_components scores
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal-component scores, not the original
# variables — labelling them with the original feature names is misleading when
# the XGBoost feature importances are read later; consider PC1..PC7 instead.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used — X below is built
# from principal_df only, so this one-hot encoding has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target: the same-day mortality rate (also present in the PCA input above)
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): PCA was fitted on the full dataset before this split, so test
# rows influenced the components (mild train/test leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the scaler fitted on the training set only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor whose hyperparameters are tuned by the grid search below
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values to search over
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.995847722245489
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# (refit=True is GridSearchCV's default, so best_model is already fitted; this
# explicit refit on the scaled training set is redundant but harmless)
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy treats its two arguments as (unnormalised) probability
# distributions and returns their KL divergence: a zero in y_pred where y_test
# is positive makes the result infinite, and negative predictions are invalid.
# Clip both vectors to a tiny positive floor so the reported value stays finite.
eps = 1e-12
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.025345261586814726 R2 Score: 0.9974928018718409 RMSE: 0.159202 Entropy Value: 0.0013971728366706324
# Rank the model inputs by their XGBoost importance scores, highest first
feature_importances = (
    pd.DataFrame({"feature": selected_cols,
                  "importance": best_model.feature_importances_})
    .sort_values(by="importance", ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.435901 |
| 1 | diabetes_prevalence | 0.345667 |
| 5 | aged_65_older | 0.147699 |
| 2 | female_smokers | 0.029630 |
| 6 | median_age | 0.019171 |
| 3 | male_smokers | 0.014463 |
| 4 | life_expectancy | 0.007470 |
# Load the combined dataframe covering all 26 countries
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Finland'
country2 = 'France'
# Extracting important features for XGBoost Model Analysis for the country health index
cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
        'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
# Select rows for the two countries and the index columns in one step.
# .copy() makes an independent frame: the lagged-mortality columns assigned
# later would otherwise hit a filtered *view* and raise pandas'
# SettingWithCopyWarning (and can silently fail to write through).
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 3.28 | 0.938 | 40585.721 | 18.136 | 5540745 | 0.000000 |
| 7311 | Finland | 1/30/2020 | 3.28 | 0.938 | 40585.721 | 18.136 | 5540745 | 0.000000 |
| 7312 | Finland | 1/31/2020 | 3.28 | 0.938 | 40585.721 | 18.136 | 5540745 | 0.000000 |
| 7313 | Finland | 2/1/2020 | 3.28 | 0.938 | 40585.721 | 18.136 | 5540745 | 0.000000 |
| 7314 | Finland | 2/2/2020 | 3.28 | 0.938 | 40585.721 | 18.136 | 5540745 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 5.98 | 0.901 | 38605.671 | 122.578 | 67813000 | 0.411710 |
| 9443 | France | 12/26/2022 | 5.98 | 0.901 | 38605.671 | 122.578 | 67813000 | 0.411282 |
| 9444 | France | 12/27/2022 | 5.98 | 0.901 | 38605.671 | 122.578 | 67813000 | 0.411730 |
| 9445 | France | 12/28/2022 | 5.98 | 0.901 | 38605.671 | 122.578 | 67813000 | 0.411813 |
| 9446 | France | 12/29/2022 | 5.98 | 0.901 | 38605.671 | 122.578 | 67813000 | 0.411892 |
2137 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the per-country mortality series by 1 day, 1 week, and 1 month;
# the leading rows with no history (NaN after shift) are zero-filled.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target)
# and its lagged copies in the PCA input, so target information leaks into the
# components used as model features — this likely inflates the downstream R^2;
# confirm whether that is intended.
# NOTE(review): PCA is fitted on unscaled data here, so large-magnitude columns
# (e.g. population) dominate the components; scaling usually precedes PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
# Project onto the PCA basis fitted above and keep the first n_components scores
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal-component scores, not the original
# variables — labelling them with the original feature names is misleading when
# the XGBoost feature importances are read later; consider PC1..PC5 instead.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used — X below is built
# from principal_df only, so this one-hot encoding has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Target: the same-day mortality rate (also present in the PCA input above)
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): PCA was fitted on the full dataset before this split, so test
# rows influenced the components (mild train/test leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the scaler fitted on the training set only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor whose hyperparameters are tuned by the grid search below
xgb_model = xgb.XGBRegressor()
# Candidate hyperparameter values to search over
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9953573241288052
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# (refit=True is GridSearchCV's default, so best_model is already fitted; this
# explicit refit on the scaled training set is redundant but harmless)
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy treats its two arguments as (unnormalised) probability
# distributions and returns their KL divergence: a zero in y_pred where y_test
# is positive makes the result infinite (this run printed "Entropy Value: inf"),
# and negative predictions are invalid. Clip both vectors to a tiny positive
# floor so the reported value stays finite.
eps = 1e-12
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.08019301369568292 R2 Score: 0.9920671651724497 RMSE: 0.283184 Entropy Value: inf
# Rank the model inputs by their XGBoost importance scores, highest first
feature_importances = (
    pd.DataFrame({"feature": selected_cols,
                  "importance": best_model.feature_importances_})
    .sort_values(by="importance", ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.459088 |
| 4 | population | 0.234622 |
| 0 | hospital_beds_per_thousand | 0.205086 |
| 2 | gdp_per_capita | 0.079619 |
| 3 | population_density | 0.021585 |
# Load the combined dataframe covering all 26 countries
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Netherlands'
country2 = 'Serbia'
# Extracting important features for XGBoost Model Analysis for the population health index
cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
        'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
# Select rows for the two countries and the index columns in one step.
# .copy() makes an independent frame: the lagged-mortality columns assigned
# later would otherwise hit a filtered *view* and raise pandas'
# SettingWithCopyWarning (and can silently fail to write through).
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716205 |
2075 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the per-country mortality series by 1 day, 1 week, and 1 month;
# the leading rows with no history (NaN after shift) are zero-filled.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target)
# and its lagged copies in the PCA input, so target information leaks into the
# components used as model features — this likely inflates the downstream R^2;
# confirm whether that is intended.
# NOTE(review): PCA is fitted on unscaled data here, so large-magnitude columns
# dominate the components; scaling usually precedes PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label. NOTE(review): the resulting dummy columns
# are never fed to the model below — only the PCA outputs form X; confirm the
# encoding is still wanted.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Inputs come from the PCA projection; the target stays on the original frame.
X, y = principal_df[selected_cols].values, df_updated['Mortality Rate'].values
# Hold out 30% for testing. NOTE(review): this shuffles a time series, so
# training rows can postdate test rows — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: fit the scaler on the training split only, then apply
# that same transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Tune an XGBoost regressor by exhaustive grid search with 10-fold
# cross-validation, parallelized across all available cores.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9994911517097467
# Refit the tuned model on the training split and score it on the held-out set.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)
# Error metrics: MSE, RMSE, R^2, plus scipy's entropy().
# NOTE(review): entropy(y_test, y_pred) is the KL divergence between the two
# vectors renormalized as probability distributions — not the entropy of the
# errors — and is ill-defined when y_pred has zero/negative entries; confirm
# this metric is what was intended.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0030060796550120455 R2 Score: 0.9996026740974251 RMSE: 0.054828 Entropy Value: 0.00020234686943402895
# Rank the PCA-derived inputs by the tuned model's importance scores,
# highest first. (Remember: these are component scores, not raw variables.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 6 | median_age | 0.757461 |
| 1 | diabetes_prevalence | 0.230741 |
| 5 | aged_65_older | 0.005390 |
| 2 | female_smokers | 0.004505 |
| 0 | cardiovasc_death_rate | 0.001455 |
| 3 | male_smokers | 0.000398 |
| 4 | life_expectancy | 0.000050 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — this breaks on any other
# machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Restrict to the socio-economic ("country health index") predictors plus the
# target, keeping only the two countries under comparison.
country1 = 'Netherlands'
country2 = 'Serbia'
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.320 | 0.944 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.320 | 0.944 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.320 | 0.944 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.320 | 0.944 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.320 | 0.944 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 5.609 | 0.806 | 14048.881 | 80.291 | 6871547 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 5.609 | 0.806 | 14048.881 | 80.291 | 6871547 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 5.609 | 0.806 | 14048.881 | 80.291 | 6871547 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 5.609 | 0.806 | 14048.881 | 80.291 | 6871547 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 5.609 | 0.806 | 14048.881 | 80.291 | 6871547 | 0.716205 |
2075 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country time series into a supervised-learning layout by
# adding lagged copies of the target: mortality 1 day, 7 days, and 30 days back.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, lag in lag_spec.items():
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
    # The first `lag` rows of each country have no history; treat them as 0.
    df_updated[lag_col] = shifted.fillna(0)
# --- PCA to mitigate multi-collinearity among the predictors ---
# BUG FIX: the original code fit PCA on df_updated.iloc[:, 2:], which includes
# the 'Mortality Rate' TARGET column; projecting the target into the model
# inputs leaks the answer into X and inflates every downstream score.
feature_matrix = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
# NOTE(review): PCA runs on unscaled data here, so large-variance columns
# (e.g. population) dominate the components — consider standardizing first.
# Left as-is to keep this fix limited to the leakage issue.
pca = PCA()
pca.fit(feature_matrix)
# Keep the first 5 components to match the 5 inputs of the country-health
# XGBoost model analysis.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(feature_matrix)[:, :n_components]
# CAUTION: these columns are principal components, NOT the original variables;
# the original names are kept only so downstream column selection still works.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label. NOTE(review): the resulting dummy columns
# are never fed to the model below — only the PCA outputs form X; confirm the
# encoding is still wanted.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
# Inputs come from the PCA projection; the target stays on the original frame.
X, y = principal_df[selected_cols].values, df_updated['Mortality Rate'].values
# Hold out 30% for testing. NOTE(review): this shuffles a time series, so
# training rows can postdate test rows — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: fit the scaler on the training split only, then apply
# that same transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Tune an XGBoost regressor by exhaustive grid search with 10-fold
# cross-validation, parallelized across all available cores.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.997512391985607
# Refit the tuned model on the training split and score it on the held-out set.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)
# Error metrics: MSE, RMSE, R^2, plus scipy's entropy().
# NOTE(review): entropy(y_test, y_pred) is the KL divergence between the two
# vectors renormalized as probability distributions — not the entropy of the
# errors — and is ill-defined when y_pred has zero/negative entries; confirm
# this metric is what was intended.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.017843262238995315 R2 Score: 0.9976415826965301 RMSE: 0.133579 Entropy Value: 0.003495616937415529
# Rank the PCA-derived inputs by the tuned model's importance scores,
# highest first. (Remember: these are component scores, not raw variables.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.628833 |
| 2 | gdp_per_capita | 0.207723 |
| 0 | hospital_beds_per_thousand | 0.148684 |
| 3 | population_density | 0.007596 |
| 4 | population | 0.007164 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — this breaks on any other
# machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Restrict to the demographic/health ("population health index") predictors
# plus the target, keeping only the two countries under comparison.
country1 = 'Slovenia'
country2 = 'Switzerland'
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.536669 |
2101 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country time series into a supervised-learning layout by
# adding lagged copies of the target: mortality 1 day, 7 days, and 30 days back.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, lag in lag_spec.items():
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
    # The first `lag` rows of each country have no history; treat them as 0.
    df_updated[lag_col] = shifted.fillna(0)
# --- PCA to mitigate multi-collinearity among the predictors ---
# BUG FIX: the original code fit PCA on df_updated.iloc[:, 2:], which includes
# the 'Mortality Rate' TARGET column; projecting the target into the model
# inputs leaks the answer into X and inflates every downstream score.
feature_matrix = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
# NOTE(review): PCA runs on unscaled data here, so large-variance columns
# dominate the components — consider standardizing before PCA. Left as-is to
# keep this fix limited to the leakage issue.
pca = PCA()
pca.fit(feature_matrix)
# Keep the first 7 components to match the 7 inputs of the population-health
# XGBoost model analysis.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(feature_matrix)[:, :n_components]
# CAUTION: these columns are principal components, NOT the original variables;
# the original names are kept only so downstream column selection still works.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label. NOTE(review): the resulting dummy columns
# are never fed to the model below — only the PCA outputs form X; confirm the
# encoding is still wanted.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Inputs come from the PCA projection; the target stays on the original frame.
X, y = principal_df[selected_cols].values, df_updated['Mortality Rate'].values
# Hold out 30% for testing. NOTE(review): this shuffles a time series, so
# training rows can postdate test rows — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: fit the scaler on the training split only, then apply
# that same transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Tune an XGBoost regressor by exhaustive grid search with 10-fold
# cross-validation, parallelized across all available cores.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979540624404055
# Refit the tuned model on the training split and score it on the held-out set.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)
# Error metrics: MSE, RMSE, R^2, plus scipy's entropy().
# NOTE(review): entropy(y_test, y_pred) is the KL divergence between the two
# vectors renormalized as probability distributions — not the entropy of the
# errors — and is ill-defined when y_pred has zero/negative entries; confirm
# this metric is what was intended.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0069541682155272394 R2 Score: 0.997542383229531 RMSE: 0.083392 Entropy Value: 0.0005493830796332597
# Rank the PCA-derived inputs by the tuned model's importance scores,
# highest first. (Remember: these are component scores, not raw variables.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.588315 |
| 6 | median_age | 0.127966 |
| 0 | cardiovasc_death_rate | 0.102203 |
| 3 | male_smokers | 0.079325 |
| 2 | female_smokers | 0.058231 |
| 5 | aged_65_older | 0.043306 |
| 4 | life_expectancy | 0.000655 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — this breaks on any other
# machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Restrict to the socio-economic ("country health index") predictors plus the
# target, keeping only the two countries under comparison.
country1 = 'Slovenia'
country2 = 'Switzerland'
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 4.53 | 0.955 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 4.53 | 0.955 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 4.53 | 0.955 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 4.53 | 0.955 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 4.53 | 0.955 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 31400.840 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 31400.840 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 31400.840 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 31400.840 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 31400.840 | 102.619 | 2119843 | 0.536669 |
2101 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country time series into a supervised-learning layout by
# adding lagged copies of the target: mortality 1 day, 7 days, and 30 days back.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, lag in lag_spec.items():
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
    # The first `lag` rows of each country have no history; treat them as 0.
    df_updated[lag_col] = shifted.fillna(0)
# --- PCA to mitigate multi-collinearity among the predictors ---
# BUG FIX: the original code fit PCA on df_updated.iloc[:, 2:], which includes
# the 'Mortality Rate' TARGET column; projecting the target into the model
# inputs leaks the answer into X and inflates every downstream score.
feature_matrix = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
# NOTE(review): PCA runs on unscaled data here, so large-variance columns
# (e.g. population) dominate the components — consider standardizing first.
# Left as-is to keep this fix limited to the leakage issue.
pca = PCA()
pca.fit(feature_matrix)
# Keep the first 5 components to match the 5 inputs of the country-health
# XGBoost model analysis.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(feature_matrix)[:, :n_components]
# CAUTION: these columns are principal components, NOT the original variables;
# the original names are kept only so downstream column selection still works.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label. NOTE(review): the resulting dummy columns
# are never fed to the model below — only the PCA outputs form X; confirm the
# encoding is still wanted.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
# Inputs come from the PCA projection; the target stays on the original frame.
X, y = principal_df[selected_cols].values, df_updated['Mortality Rate'].values
# Hold out 30% for testing. NOTE(review): this shuffles a time series, so
# training rows can postdate test rows — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: fit the scaler on the training split only, then apply
# that same transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Tune an XGBoost regressor by exhaustive grid search with 10-fold
# cross-validation, parallelized across all available cores.
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989566850992713
# Refit the tuned model on the training split and score it on the held-out set.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)
# Error metrics: MSE, RMSE, R^2, plus scipy's entropy().
# NOTE(review): entropy(y_test, y_pred) is the KL divergence between the two
# vectors renormalized as probability distributions — not the entropy of the
# errors — and is ill-defined when y_pred has zero/negative entries; confirm
# this metric is what was intended.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00732376448420232 R2 Score: 0.9974117671788335 RMSE: 0.085579 Entropy Value: 0.0010900984099785184
# Rank the PCA-derived inputs by the tuned model's importance scores,
# highest first. (Remember: these are component scores, not raw variables.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.690075 |
| 2 | gdp_per_capita | 0.192149 |
| 3 | population_density | 0.091684 |
| 0 | hospital_beds_per_thousand | 0.024127 |
| 4 | population | 0.001965 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — this breaks on any other
# machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis.
country1 = 'Austria'
country2 = 'Belgium'
# Keep the identifier columns, the population-health predictors, and the
# Mortality Rate target, restricted to rows for the two selected countries.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                          'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
                          'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
2099 rows × 10 columns
# Rationale: XGBoost needs a tabular supervised-learning problem, so the mortality
# time series is converted into one by adding lagged copies of the target — the
# previous day's (t-1), previous week's (t-7) and previous month's (t-30) mortality
# rate — computed with pandas shift() so each row carries its own history as features.
# Create lagged variables per country so values never bleed across locations.
for lag_col, periods in (('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)):
    # Rows before the lag window starts have no history; fill those NaNs with 0,
    # which matches the 0.0 mortality rate at the start of each country's series.
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(periods)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA input still
# contains 'Mortality Rate' (the prediction target) and its lag columns — the leading
# components can leak target information into X downstream. Confirm this is intended.
# NOTE(review): PCA is fitted on unscaled columns; StandardScaler is only applied
# after the projection, so large-valued columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
# Project onto the fitted components and keep only the first n_components columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original variable names, but each column
# is a principal component (a linear mix of ALL PCA inputs, including the mortality
# lag columns), not the named raw variable — downstream "feature importances"
# therefore describe components, not the original features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never added to X below, so this encoding has
# no effect on the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): the split is random (random_state=42) over time-ordered rows, so
# training folds contain observations later in time than some test observations.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits using statistics learned from the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Gradient-boosted tree regressor; its hyperparameters are tuned by the grid search.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the exhaustive grid search.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive search with 10-fold cross-validation, parallelised over all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9985877971766527
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the training data by default
# (refit=True), so this explicit fit retrains an already-fitted model — harmless
# but redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the Mortality Rate target
score = r2_score(y_test, y_pred)
# NOTE(review): entropy(y_test, y_pred) computes the KL divergence between the two
# vectors normalised as distributions, not an entropy of the errors — confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005399476083504289 R2 Score: 0.9995439543541608 RMSE: 0.073481 Entropy Value: 0.0003649161068700861
# Rank the model inputs by the tuned XGBoost regressor's importance scores,
# most influential first, and display the resulting table.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 6 | median_age | 0.846396 |
| 0 | cardiovasc_death_rate | 0.062219 |
| 1 | diabetes_prevalence | 0.061911 |
| 5 | aged_65_older | 0.025860 |
| 2 | female_smokers | 0.002106 |
| 3 | male_smokers | 0.001366 |
| 4 | life_expectancy | 0.000142 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded, user-specific absolute Windows path — the notebook only
# runs on this machine; consider a relative path or configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated  # notebook cell output: display the loaded frame
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis.
country1 = 'Austria'
country2 = 'Belgium'
# Keep the identifier columns, the country-health predictors, and the
# Mortality Rate target, restricted to rows for the two selected countries.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                       'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 5.64 | 0.931 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 5.64 | 0.931 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 5.64 | 0.931 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 5.64 | 0.931 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 5.64 | 0.931 | 42658.576 | 375.564 | 11655923 | 0.711787 |
2099 rows × 8 columns
# Rationale: XGBoost needs a tabular supervised-learning problem, so the mortality
# time series is converted into one by adding lagged copies of the target — the
# previous day's (t-1), previous week's (t-7) and previous month's (t-30) mortality
# rate — computed with pandas shift() so each row carries its own history as features.
# Create lagged variables per country so values never bleed across locations.
for lag_col, periods in (('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)):
    # Rows before the lag window starts have no history; fill those NaNs with 0,
    # which matches the 0.0 mortality rate at the start of each country's series.
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(periods)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA input still
# contains 'Mortality Rate' (the prediction target) and its lag columns — the leading
# components can leak target information into X downstream. Confirm this is intended.
# NOTE(review): PCA is fitted on unscaled columns; StandardScaler is only applied
# after the projection, so large-valued columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
# Project onto the fitted components and keep only the first n_components columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original variable names, but each column
# is a principal component (a linear mix of ALL PCA inputs, including the mortality
# lag columns), not the named raw variable — downstream "feature importances"
# therefore describe components, not the original features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never added to X below, so this encoding has
# no effect on the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): the split is random (random_state=42) over time-ordered rows, so
# training folds contain observations later in time than some test observations.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits using statistics learned from the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Gradient-boosted tree regressor; its hyperparameters are tuned by the grid search.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the exhaustive grid search.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive search with 10-fold cross-validation, parallelised over all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979482681643022
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the training data by default
# (refit=True), so this explicit fit retrains an already-fitted model — harmless
# but redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the Mortality Rate target
score = r2_score(y_test, y_pred)
# NOTE(review): entropy(y_test, y_pred) computes the KL divergence between the two
# vectors normalised as distributions, not an entropy of the errors — confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.017351533220202466 R2 Score: 0.9985344705576373 RMSE: 0.131725 Entropy Value: 0.0015777407587553624
# Rank the model inputs by the tuned XGBoost regressor's importance scores,
# most influential first, and display the resulting table.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.700511 |
| 2 | gdp_per_capita | 0.139220 |
| 0 | hospital_beds_per_thousand | 0.123668 |
| 3 | population_density | 0.032006 |
| 4 | population | 0.004595 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded, user-specific absolute Windows path — the notebook only
# runs on this machine; consider a relative path or configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated  # notebook cell output: display the loaded frame
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis.
country1 = 'Canada'
country2 = 'Denmark'
# Keep the identifier columns, the population-health predictors, and the
# Mortality Rate target, restricted to rows for the two selected countries.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                          'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
                          'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.093162 |
2134 rows × 10 columns
# Rationale: XGBoost needs a tabular supervised-learning problem, so the mortality
# time series is converted into one by adding lagged copies of the target — the
# previous day's (t-1), previous week's (t-7) and previous month's (t-30) mortality
# rate — computed with pandas shift() so each row carries its own history as features.
# Create lagged variables per country so values never bleed across locations.
for lag_col, periods in (('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)):
    # Rows before the lag window starts have no history; fill those NaNs with 0,
    # which matches the 0.0 mortality rate at the start of each country's series.
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(periods)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA input still
# contains 'Mortality Rate' (the prediction target) and its lag columns — the leading
# components can leak target information into X downstream. Confirm this is intended.
# NOTE(review): PCA is fitted on unscaled columns; StandardScaler is only applied
# after the projection, so large-valued columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
# Project onto the fitted components and keep only the first n_components columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original variable names, but each column
# is a principal component (a linear mix of ALL PCA inputs, including the mortality
# lag columns), not the named raw variable — downstream "feature importances"
# therefore describe components, not the original features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never added to X below, so this encoding has
# no effect on the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): the split is random (random_state=42) over time-ordered rows, so
# training folds contain observations later in time than some test observations.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits using statistics learned from the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Gradient-boosted tree regressor; its hyperparameters are tuned by the grid search.
xgb_model = xgb.XGBRegressor()
# Hyperparameter search space for the exhaustive grid search.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive search with 10-fold cross-validation, parallelised over all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9992606850560544
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the training data by default
# (refit=True), so this explicit fit retrains an already-fitted model — harmless
# but redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the Mortality Rate target
score = r2_score(y_test, y_pred)
# NOTE(review): entropy(y_test, y_pred) computes the KL divergence between the two
# vectors normalised as distributions, not an entropy of the errors — confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0025729522227882153 R2 Score: 0.9993859368044151 RMSE: 0.050724 Entropy Value: 0.00030984264476932686
# Rank the model inputs by the tuned XGBoost regressor's importance scores,
# most influential first, and display the resulting table.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.665696 |
| 6 | median_age | 0.173524 |
| 0 | cardiovasc_death_rate | 0.127952 |
| 5 | aged_65_older | 0.019835 |
| 2 | female_smokers | 0.011763 |
| 3 | male_smokers | 0.001135 |
| 4 | life_expectancy | 0.000095 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded, user-specific absolute Windows path — the notebook only
# runs on this machine; consider a relative path or configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated  # notebook cell output: display the loaded frame
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis.
country1 = 'Canada'
country2 = 'Denmark'
# Keep the identifier columns, the country-health predictors, and the
# Mortality Rate target, restricted to rows for the two selected countries.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                       'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 2.5 | 0.940 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 2.5 | 0.940 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 2.5 | 0.940 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 2.5 | 0.940 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 2.5 | 0.940 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.5 | 0.929 | 44017.591 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.5 | 0.929 | 44017.591 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.5 | 0.929 | 44017.591 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.5 | 0.929 | 44017.591 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.5 | 0.929 | 44017.591 | 4.037 | 38454328 | 1.093162 |
2134 rows × 8 columns
# Rationale: XGBoost needs a tabular supervised-learning problem, so the mortality
# time series is converted into one by adding lagged copies of the target — the
# previous day's (t-1), previous week's (t-7) and previous month's (t-30) mortality
# rate — computed with pandas shift() so each row carries its own history as features.
# Create lagged variables per country so values never bleed across locations.
for lag_col, periods in (('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)):
    # Rows before the lag window starts have no history; fill those NaNs with 0,
    # which matches the 0.0 mortality rate at the start of each country's series.
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(periods)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA input still
# contains 'Mortality Rate' (the prediction target) and its lag columns — the leading
# components can leak target information into X downstream. Confirm this is intended.
# NOTE(review): PCA is fitted on unscaled columns; StandardScaler is only applied
# after the projection, so large-valued columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9981308764210667
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.006611715252313853 R2 Score: 0.998422041824106 RMSE: 0.081312 Entropy Value: 0.001601041202106293
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.723446 |
| 0 | hospital_beds_per_thousand | 0.146505 |
| 2 | gdp_per_capita | 0.065728 |
| 3 | population_density | 0.048554 |
| 4 | population | 0.015767 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- consider a relative
# path or configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Estonia'
country2 = 'Iceland'
# Restrict the frame to the population health-index predictors plus the
# target, keeping only rows for the selected country pair.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate',
                          'diabetes_prevalence', 'female_smokers',
                          'male_smokers', 'life_expectancy', 'aged_65_older',
                          'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| 6250 | Estonia | 1/18/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| 6251 | Estonia | 2/5/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| 6252 | Estonia | 2/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| 6253 | Estonia | 2/7/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
2097 rows × 10 columns
'''
Convert the time-series dataset to a supervised-learning layout by adding
lagged mortality-rate features (previous day / week / month), then tune an
XGBoost regressor with 10-fold grid-search CV and report which population
health-index predictors drive COVID-19 mortality for the selected pair.
'''
# Lagged mortality features, computed per country so one country's history
# never bleeds into another's; rows with no history are filled with 0.
for lag, col in ((1, 'prev_day_mortality'),
                 (7, 'prev_week_mortality'),
                 (30, 'prev_month_mortality')):
    df_updated[col] = (df_updated.groupby('location')['Mortality Rate']
                       .shift(lag).fillna(0))

# Predictors and target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence',
                 'female_smokers', 'male_smokers', 'life_expectancy',
                 'aged_65_older', 'median_age']
X = df_updated[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split BEFORE fitting any preprocessing so the test fold cannot leak into
# it. NOTE(review): the original fitted PCA on df_updated.iloc[:, 2:] over
# ALL rows -- that slice also contained 'Mortality Rate' and its lagged
# copies, so the target leaked into the inputs and the near-perfect
# R^2 (~0.997) previously printed here was inflated.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Standardize with statistics learned on the training set only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# PCA retained purely as a multicollinearity diagnostic.
# NOTE(review): the original trained on PCA components but labelled them
# with the original feature names and reported XGBoost importances against
# those labels; principal components are linear mixtures of all inputs, so
# that attribution was meaningless. Training now uses the scaled original
# features so feature_importances_ genuinely refer to selected_cols.
pca = PCA().fit(X_train_scaled)
print("PCA explained variance ratio:", pca.explained_variance_ratio_)

# Hyperparameter tuning: grid search with 10-fold cross-validation (k=10).
xgb_model = xgb.XGBRegressor()
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params,
                           cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)

# Refit the best configuration and evaluate on the held-out test set.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence between two
# probability *distributions* (it normalizes its inputs) -- it is not a
# standard regression metric and errors out on negative predictions.
# Kept only for continuity with earlier runs; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)

# Importances now refer to the actual input columns.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_})
feature_importances = feature_importances.sort_values('importance',
                                                      ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.544620 |
| 1 | diabetes_prevalence | 0.367091 |
| 6 | median_age | 0.053979 |
| 5 | aged_65_older | 0.021297 |
| 2 | female_smokers | 0.012030 |
| 3 | male_smokers | 0.000796 |
| 4 | life_expectancy | 0.000188 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- consider a relative
# path or configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Estonia'
country2 = 'Iceland'
# Restrict the frame to the country health-index predictors plus the
# target, keeping only rows for the selected country pair.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'gdp_per_capita',
                       'population_density', 'population', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 4.69 | 0.892 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| 6250 | Estonia | 1/18/2020 | 4.69 | 0.892 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| 6251 | Estonia | 2/5/2020 | 4.69 | 0.892 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| 6252 | Estonia | 2/6/2020 | 4.69 | 0.892 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| 6253 | Estonia | 2/7/2020 | 4.69 | 0.892 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.949 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.949 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.949 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.949 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.949 | 46482.958 | 3.404 | 372903 | 0.11011 |
2097 rows × 8 columns
'''
Convert the time-series dataset to a supervised-learning layout by adding
lagged mortality-rate features (previous day / week / month), then tune an
XGBoost regressor with 10-fold grid-search CV and report which country
health-index predictors drive COVID-19 mortality for the selected pair.
'''
# Lagged mortality features, computed per country so one country's history
# never bleeds into another's; rows with no history are filled with 0.
for lag, col in ((1, 'prev_day_mortality'),
                 (7, 'prev_week_mortality'),
                 (30, 'prev_month_mortality')):
    df_updated[col] = (df_updated.groupby('location')['Mortality Rate']
                       .shift(lag).fillna(0))

# Predictors and target.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'gdp_per_capita', 'population_density', 'population']
X = df_updated[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split BEFORE fitting any preprocessing so the test fold cannot leak into
# it. NOTE(review): the original fitted PCA on df_updated.iloc[:, 2:] over
# ALL rows -- that slice also contained 'Mortality Rate' and its lagged
# copies, so the target leaked into the inputs and the near-perfect
# R^2 (~0.998) previously printed here was inflated.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Standardize with statistics learned on the training set only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# PCA retained purely as a multicollinearity diagnostic.
# NOTE(review): the original trained on PCA components but labelled them
# with the original feature names and reported XGBoost importances against
# those labels; principal components are linear mixtures of all inputs, so
# that attribution was meaningless. Training now uses the scaled original
# features so feature_importances_ genuinely refer to selected_cols.
pca = PCA().fit(X_train_scaled)
print("PCA explained variance ratio:", pca.explained_variance_ratio_)

# Hyperparameter tuning: grid search with 10-fold cross-validation (k=10).
xgb_model = xgb.XGBRegressor()
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params,
                           cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)

# Refit the best configuration and evaluate on the held-out test set.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence between two
# probability *distributions* (it normalizes its inputs) -- it is not a
# standard regression metric and errors out on negative predictions.
# Kept only for continuity with earlier runs; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)

# Importances now refer to the actual input columns.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_})
feature_importances = feature_importances.sort_values('importance',
                                                      ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.646961 |
| 0 | hospital_beds_per_thousand | 0.271199 |
| 2 | gdp_per_capita | 0.066857 |
| 3 | population_density | 0.013454 |
| 4 | population | 0.001528 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- consider a relative
# path or configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Ireland'
country2 = 'Latvia'
# Restrict the frame to the population health-index predictors plus the
# target, keeping only rows for the selected country pair.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate',
                          'diabetes_prevalence', 'female_smokers',
                          'male_smokers', 'life_expectancy', 'aged_65_older',
                          'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631969 |
2073 rows × 10 columns
'''
Convert the time-series dataset to a supervised-learning layout by adding
lagged mortality-rate features (previous day / week / month), then tune an
XGBoost regressor with 10-fold grid-search CV and report which population
health-index predictors drive COVID-19 mortality for the selected pair.
'''
# Lagged mortality features, computed per country so one country's history
# never bleeds into another's; rows with no history are filled with 0.
for lag, col in ((1, 'prev_day_mortality'),
                 (7, 'prev_week_mortality'),
                 (30, 'prev_month_mortality')):
    df_updated[col] = (df_updated.groupby('location')['Mortality Rate']
                       .shift(lag).fillna(0))

# Predictors and target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence',
                 'female_smokers', 'male_smokers', 'life_expectancy',
                 'aged_65_older', 'median_age']
X = df_updated[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split BEFORE fitting any preprocessing so the test fold cannot leak into
# it. NOTE(review): the original fitted PCA on df_updated.iloc[:, 2:] over
# ALL rows -- that slice also contained 'Mortality Rate' and its lagged
# copies, so the target leaked into the inputs and the near-perfect
# R^2 (~0.999) previously printed here was inflated.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Standardize with statistics learned on the training set only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# PCA retained purely as a multicollinearity diagnostic.
# NOTE(review): the original trained on PCA components but labelled them
# with the original feature names and reported XGBoost importances against
# those labels; principal components are linear mixtures of all inputs, so
# that attribution was meaningless. Training now uses the scaled original
# features so feature_importances_ genuinely refer to selected_cols.
pca = PCA().fit(X_train_scaled)
print("PCA explained variance ratio:", pca.explained_variance_ratio_)

# Hyperparameter tuning: grid search with 10-fold cross-validation (k=10).
xgb_model = xgb.XGBRegressor()
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params,
                           cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)

# Refit the best configuration and evaluate on the held-out test set.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence between two
# probability *distributions* (it normalizes its inputs) -- it is not a
# standard regression metric and errors out on negative predictions.
# Kept only for continuity with earlier runs; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)

# Importances now refer to the actual input columns.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_})
feature_importances = feature_importances.sort_values('importance',
                                                      ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.403544 |
| 5 | aged_65_older | 0.288725 |
| 0 | cardiovasc_death_rate | 0.277344 |
| 2 | female_smokers | 0.021881 |
| 6 | median_age | 0.006558 |
| 3 | male_smokers | 0.001584 |
| 4 | life_expectancy | 0.000364 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- consider a relative
# path or configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the country-health-index analysis
country1 = 'Ireland'
country2 = 'Latvia'
# Extracting important features for XGBoost Model Analysis for the country health index:
# keep only the relevant columns and restrict the rows to the two countries in one step.
wanted_columns = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
row_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[row_mask, wanted_columns]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 2.96 | 0.955 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 2.96 | 0.955 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 2.96 | 0.955 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 2.96 | 0.955 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 2.96 | 0.955 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.57 | 0.866 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.57 | 0.866 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.57 | 0.866 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.57 | 0.866 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.57 | 0.866 | 25063.846 | 31.212 | 1850654 | 0.631969 |
2073 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Shifts are grouped by country so one country's series never bleeds into another's.
# NOTE(review): df_updated is a row-filtered slice of the frame loaded above, so these
# assignments may trigger pandas' SettingWithCopyWarning — consider .copy() upstream.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no history; 0 is used as the fill value)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which includes
# 'Mortality Rate' (the prediction target) and its three lagged copies. Fitting PCA on
# the target leaks it into the model inputs and likely inflates the near-perfect scores
# reported below. PCA is also fit on the full dataset before the train/test split
# (another leak) and on unscaled data, so the highest-variance column dominates.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
# Project onto the first 5 components of the PCA fitted above.
# NOTE(review): that PCA was fitted on ALL numeric columns, including 'Mortality Rate'
# and its lags, so these components carry target information — see the note at the fit.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): naming component columns after the original features is misleading —
# each column is a linear mixture of all inputs, not the named variable itself.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X is built from principal_df),
# so this call only keeps 'Mortality Rate' reachable for y below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a time series mixes past and future observations;
# a chronological split would avoid look-ahead bias.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set (statistics come from the training data only)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (re-using the training-set statistics)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model (default objective: squared-error regression)
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# (3*3*3*3*2*2 = 324 candidates; with cv=10 below this is 3240 model fits)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# scoring is not set, so the regressor's default R^2 is used; n_jobs=-1 uses all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979185512625015
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the whole training set by default
# (refit=True), so this explicit fit repeats work but does not change the model.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises its inputs and computes a KL
# divergence between probability distributions; feeding raw target/prediction vectors
# (which contain zeros and are not distributions) does not yield a standard regression
# error metric — confirm the intent or drop this value.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.006417548514505037 R2 Score: 0.9970884609748346 RMSE: 0.080110 Entropy Value: 0.0010836828796983616
# Gain-based importances from the fitted XGBoost model.
# NOTE(review): the model was trained on principal components (X was built from
# principal_df), not on the raw features, so these importances belong to the
# components. Labelling them with the original column names is misleading;
# label them PC1..PCk instead.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({
    'feature': ['PC%d' % (i + 1) for i in range(len(feature_importances))],
    'importance': feature_importances,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.727506 |
| 0 | hospital_beds_per_thousand | 0.137840 |
| 2 | gdp_per_capita | 0.055472 |
| 3 | population_density | 0.045744 |
| 4 | population | 0.033436 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — replace with a relative or
# configurable path before sharing this notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the population-health-index analysis
country1 = 'Luxembourg'
country2 = 'Portugal'
# Extracting important features for XGBoost Model Analysis for the population health index:
# keep only the relevant columns and restrict the rows to the two countries in one step.
wanted_columns = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
row_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[row_mask, wanted_columns]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
2075 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Shifts are grouped by country so one country's series never bleeds into another's.
# NOTE(review): df_updated is a row-filtered slice of the frame loaded above, so these
# assignments may trigger pandas' SettingWithCopyWarning — consider .copy() upstream.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no history; 0 is used as the fill value)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which includes
# 'Mortality Rate' (the prediction target) and its three lagged copies. Fitting PCA on
# the target leaks it into the model inputs and likely inflates the near-perfect scores
# reported below. PCA is also fit on the full dataset before the train/test split
# (another leak) and on unscaled data, so the highest-variance column dominates.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
# Project onto the first 7 components of the PCA fitted above.
# NOTE(review): that PCA was fitted on ALL numeric columns, including 'Mortality Rate'
# and its lags, so these components carry target information — see the note at the fit.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): naming component columns after the original features is misleading —
# each column is a linear mixture of all inputs, not the named variable itself.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X is built from principal_df),
# so this call only keeps 'Mortality Rate' reachable for y below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a time series mixes past and future observations;
# a chronological split would avoid look-ahead bias.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set (statistics come from the training data only)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (re-using the training-set statistics)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model (default objective: squared-error regression)
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# (3*3*3*3*2*2 = 324 candidates; with cv=10 below this is 3240 model fits)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# scoring is not set, so the regressor's default R^2 is used; n_jobs=-1 uses all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9977076018554133
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the whole training set by default
# (refit=True), so this explicit fit repeats work but does not change the model.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises its inputs and computes a KL
# divergence between probability distributions; feeding raw target/prediction vectors
# (which contain zeros and are not distributions) does not yield a standard regression
# error metric — confirm the intent or drop this value.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.001019000062480644 R2 Score: 0.9987690062547512 RMSE: 0.031922 Entropy Value: 0.0002762300078945429
# Gain-based importances from the fitted XGBoost model.
# NOTE(review): the model was trained on principal components (X was built from
# principal_df), not on the raw features, so these importances belong to the
# components. Labelling them with the original column names is misleading;
# label them PC1..PCk instead.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({
    'feature': ['PC%d' % (i + 1) for i in range(len(feature_importances))],
    'importance': feature_importances,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.832116 |
| 0 | cardiovasc_death_rate | 0.121544 |
| 5 | aged_65_older | 0.015343 |
| 2 | female_smokers | 0.015126 |
| 6 | median_age | 0.012136 |
| 3 | male_smokers | 0.002974 |
| 4 | life_expectancy | 0.000761 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — replace with a relative or
# configurable path before sharing this notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the country-health-index analysis
country1 = 'Luxembourg'
country2 = 'Portugal'
# Extracting important features for XGBoost Model Analysis for the country health index:
# keep only the relevant columns and restrict the rows to the two countries in one step.
wanted_columns = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
row_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[row_mask, wanted_columns]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.39 | 0.864 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.39 | 0.864 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.39 | 0.864 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.39 | 0.864 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.39 | 0.864 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 4.51 | 0.916 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 4.51 | 0.916 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 4.51 | 0.916 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 4.51 | 0.916 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 4.51 | 0.916 | 94277.965 | 231.447 | 647601 | 0.377872 |
2075 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Shifts are grouped by country so one country's series never bleeds into another's.
# NOTE(review): df_updated is a row-filtered slice of the frame loaded above, so these
# assignments may trigger pandas' SettingWithCopyWarning — consider .copy() upstream.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no history; 0 is used as the fill value)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which includes
# 'Mortality Rate' (the prediction target) and its three lagged copies. Fitting PCA on
# the target leaks it into the model inputs and likely inflates the near-perfect scores
# reported below. PCA is also fit on the full dataset before the train/test split
# (another leak) and on unscaled data, so the highest-variance column dominates.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
# Project onto the first 5 components of the PCA fitted above.
# NOTE(review): that PCA was fitted on ALL numeric columns, including 'Mortality Rate'
# and its lags, so these components carry target information — see the note at the fit.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): naming component columns after the original features is misleading —
# each column is a linear mixture of all inputs, not the named variable itself.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X is built from principal_df),
# so this call only keeps 'Mortality Rate' reachable for y below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a time series mixes past and future observations;
# a chronological split would avoid look-ahead bias.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set (statistics come from the training data only)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (re-using the training-set statistics)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model (default objective: squared-error regression)
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# (3*3*3*3*2*2 = 324 candidates; with cv=10 below this is 3240 model fits)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# scoring is not set, so the regressor's default R^2 is used; n_jobs=-1 uses all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9962570390993097
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the whole training set by default
# (refit=True), so this explicit fit repeats work but does not change the model.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises its inputs and computes a KL
# divergence between probability distributions; feeding raw target/prediction vectors
# (which contain zeros and are not distributions) does not yield a standard regression
# error metric — confirm the intent or drop this value.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002469659339838852 R2 Score: 0.9970165505261736 RMSE: 0.049696 Entropy Value: 0.0009983817597009118
# Gain-based importances from the fitted XGBoost model.
# NOTE(review): the model was trained on principal components (X was built from
# principal_df), not on the raw features, so these importances belong to the
# components. Labelling them with the original column names is misleading;
# label them PC1..PCk instead.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({
    'feature': ['PC%d' % (i + 1) for i in range(len(feature_importances))],
    'importance': feature_importances,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.645148 |
| 0 | hospital_beds_per_thousand | 0.185715 |
| 2 | gdp_per_capita | 0.102804 |
| 3 | population_density | 0.059981 |
| 4 | population | 0.006353 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — replace with a relative or
# configurable path before sharing this notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the population-health-index analysis
country1 = 'Slovakia'
country2 = 'Sweden'
# Extracting important features for XGBoost Model Analysis for the population health index:
# keep only the relevant columns and restrict the rows to the two countries in one step.
wanted_columns = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
row_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[row_mask, wanted_columns]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.816005 |
2092 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the per-country mortality time series into supervised-learning
# features: the mortality rate observed 1 day, 7 days, and 30 days earlier.
# The shift produces NaN at the head of each country's series (no earlier
# observation exists); those are filled with 0.
for _col, _lag in (('prev_day_mortality', 1),
                   ('prev_week_mortality', 7),
                   ('prev_month_mortality', 30)):
    df_updated[_col] = (df_updated.groupby('location')['Mortality Rate']
                        .shift(_lag)
                        .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only the first two columns ('location',
# 'date'), so the matrix handed to PCA appears to still include
# 'Mortality Rate' (the prediction target) and the three lag features derived
# from it -- target leakage into the components. Confirm the column order and
# drop the target before fitting.
# NOTE(review): PCA is fit on unscaled data (high-variance columns such as
# population dominate the components) and on the full dataset before the
# train/test split -- a second source of leakage.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
# Project the same feature matrix onto the first 7 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column below holds a principal component -- a linear
# combination of ALL input columns -- not the original feature it is named
# after. Reusing the raw feature names here makes the later
# feature-importance table read as a ranking of the original health
# variables, which it is not.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used afterwards --
# X is built from principal_df and y from 'Mortality Rate', so this step has
# no effect on the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): rows are daily time-series observations; a random split puts
# days adjacent to training days into the test set, inflating the scores.
# A chronological (time-based) split would give a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# Scaler statistics (mean/std) are learned from the training split only and
# applied to both splits below -- correct practice.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations; with cv=10 this trains 3240 models.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes across all CPU cores; with no `scoring` argument,
# GridSearchCV uses the regressor's default score (R^2).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9981093768435327
# Fit the model using the best hyperparameters
# NOTE(review): best_estimator_ is already refit on the full training set by
# GridSearchCV (refit=True is the default), so this second fit is redundant
# but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence -- it is not a
# regression error metric, and it returns inf whenever y_pred is 0 where
# y_test is positive. Consider removing it or using a residual-based metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.013333842213367515 R2 Score: 0.9971199527942521 RMSE: 0.115472 Entropy Value: 0.001124812762035393
feature_importances = best_model.feature_importances_
# NOTE(review): these importances belong to the principal components, which
# were only *labelled* with the original feature names -- this table is not
# a ranking of the raw health variables.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.652631 |
| 0 | cardiovasc_death_rate | 0.279490 |
| 6 | median_age | 0.042922 |
| 5 | aged_65_older | 0.017388 |
| 2 | female_smokers | 0.004467 |
| 3 | male_smokers | 0.002397 |
| 4 | life_expectancy | 0.000704 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- not portable; prefer a
# relative path or a configurable constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the pipeline.
country1 = 'Slovakia'
country2 = 'Sweden'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the two countries being compared.
# NOTE(review): later column assignments on this boolean-filtered frame may
# raise SettingWithCopyWarning -- appending .copy() here would make the
# intent explicit.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.82 | 0.860 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.82 | 0.860 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.82 | 0.860 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.82 | 0.860 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.82 | 0.860 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.945 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.945 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.945 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.945 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.945 | 46949.283 | 24.718 | 10549349 | 0.816005 |
2092 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the supervised-learning lag features per country: mortality observed
# 1 day, 7 days, and 30 days earlier. NaNs at the start of each country's
# series (no earlier observation) are replaced with 0.
for _col, _lag in (('prev_day_mortality', 1),
                   ('prev_week_mortality', 7),
                   ('prev_month_mortality', 30)):
    df_updated[_col] = (df_updated.groupby('location')['Mortality Rate']
                        .shift(_lag)
                        .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): the columns selected earlier in this cell sequence are
# ['location', 'date', ...features..., 'Mortality Rate'] plus the three lag
# columns, so iloc[:, 2:] still contains the target and its lags -- target
# leakage into the components. PCA is also fit unscaled and before the
# train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column is a principal component (a linear mix of all
# inputs), not the original feature named -- the downstream importance table
# therefore ranks components, not raw variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used -- X comes from principal_df
# and y from 'Mortality Rate'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of daily time-series rows leaks neighbouring
# days between train and test; a chronological split would be fairer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# Mean/std are learned from the training split only -- correct practice.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 324 combinations; with cv=10 this trains 3240 models.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 uses all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9966554266113448
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refit best_estimator_ on the full
# training set (refit=True default); this extra fit is redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence of the two
# arrays normalized as distributions -- not a regression metric; it yields
# inf when y_pred has a zero where y_test is positive.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.02913709652628138 R2 Score: 0.9937065241892549 RMSE: 0.170696 Entropy Value: 0.003420938019185575
feature_importances = best_model.feature_importances_
# NOTE(review): importances are of the principal components, which only carry
# the original feature names as labels -- not a ranking of the raw variables.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.583039 |
| 2 | gdp_per_capita | 0.348568 |
| 0 | hospital_beds_per_thousand | 0.037253 |
| 4 | population | 0.018961 |
| 3 | population_density | 0.012179 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the pipeline.
country1 = 'United Kingdom'
country2 = 'Bulgaria'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two countries being compared.
# NOTE(review): later column assignments on this filtered frame may raise
# SettingWithCopyWarning -- consider appending .copy().
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13605 | United Kingdom | 12/25/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13606 | United Kingdom | 12/26/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13607 | United Kingdom | 12/27/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13608 | United Kingdom | 12/28/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13609 | United Kingdom | 12/29/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
2090 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive per-country lag features for supervised learning: mortality seen
# 1, 7, and 30 days earlier; NaNs at each country's series head become 0.
for _col, _lag in (('prev_day_mortality', 1),
                   ('prev_week_mortality', 7),
                   ('prev_month_mortality', 30)):
    df_updated[_col] = (df_updated.groupby('location')['Mortality Rate']
                        .shift(_lag)
                        .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' (the target) and its lag columns -- target
# leakage. PCA is also fit unscaled and before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features they are named after -- downstream "importances" rank components.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below -- X comes from
# principal_df, y from 'Mortality Rate'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of daily time-series rows leaks neighbouring
# days between train and test; a chronological split would be fairer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 324 combinations; with cv=10 this trains 3240 models.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 uses all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9362768989166241
# Fit the model using the best hyperparameters
# NOTE(review): best_estimator_ was already refit by GridSearchCV; this
# second fit is redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is a KL divergence between the
# arrays normalized as distributions, not a regression metric. The 'inf'
# printed for this run confirms y_pred contained a zero where y_test > 0.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.9616768347996019 R2 Score: 0.9502585480128497 RMSE: 0.980651 Entropy Value: inf
feature_importances = best_model.feature_importances_
# NOTE(review): importances rank principal components, which merely carry the
# original feature names as labels.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.468417 |
| 5 | aged_65_older | 0.317984 |
| 4 | life_expectancy | 0.071588 |
| 6 | median_age | 0.051958 |
| 2 | female_smokers | 0.041580 |
| 0 | cardiovasc_death_rate | 0.037815 |
| 3 | male_smokers | 0.010659 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the pipeline.
country1 = 'United Kingdom'
country2 = 'Bulgaria'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the two countries being compared.
# NOTE(review): later column assignments on this filtered frame may raise
# SettingWithCopyWarning -- consider appending .copy().
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13605 | United Kingdom | 12/25/2022 | 2.540 | 0.932 | 39753.244 | 272.898 | 67508936 | 0.883564 |
| 13606 | United Kingdom | 12/26/2022 | 2.540 | 0.932 | 39753.244 | 272.898 | 67508936 | 0.883564 |
| 13607 | United Kingdom | 12/27/2022 | 2.540 | 0.932 | 39753.244 | 272.898 | 67508936 | 0.883564 |
| 13608 | United Kingdom | 12/28/2022 | 2.540 | 0.932 | 39753.244 | 272.898 | 67508936 | 0.883564 |
| 13609 | United Kingdom | 12/29/2022 | 2.540 | 0.932 | 39753.244 | 272.898 | 67508936 | 0.883564 |
2090 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create the per-country supervised-learning lag features: mortality rate
# observed 1, 7, and 30 days earlier; NaNs at each series head become 0.
for _col, _lag in (('prev_day_mortality', 1),
                   ('prev_week_mortality', 7),
                   ('prev_month_mortality', 30)):
    df_updated[_col] = (df_updated.groupby('location')['Mortality Rate']
                        .shift(_lag)
                        .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date'; the PCA input
# still contains 'Mortality Rate' and its lag columns (target leakage), and
# PCA is fit unscaled on the full data before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns hold principal components, not the original features
# they are named after -- downstream "importances" rank components.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of daily time-series rows leaks neighbouring
# days between train and test; a chronological split would be fairer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 324 combinations; with cv=10 this trains 3240 models.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 uses all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9336897557962663
# Fit the model using the best hyperparameters
# NOTE(review): best_estimator_ was already refit by GridSearchCV; this
# second fit is redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is a KL divergence between the
# normalized arrays, not a regression metric; it returns inf if y_pred is 0
# anywhere y_test is positive.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.46495768148618166 R2 Score: 0.9759506838962997 RMSE: 0.681878 Entropy Value: 0.006551530873100116
feature_importances = best_model.feature_importances_
# NOTE(review): importances rank principal components, which merely carry the
# original feature names as labels.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.608164 |
| 4 | population | 0.227605 |
| 0 | hospital_beds_per_thousand | 0.068975 |
| 2 | gdp_per_capita | 0.063380 |
| 3 | population_density | 0.031876 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this section
country1 = 'Italy'
country2 = 'Romania'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 17800 | Romania | 2/26/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| 17801 | Romania | 2/27/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| 17802 | Romania | 2/28/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| 17803 | Romania | 2/29/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| 17804 | Romania | 3/1/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
2102 rows × 10 columns
'''
To apply the XGBoost model to this time-series dataset, I convert it into a supervised learning problem by creating lagged variables for the previous day's, previous week's, and previous month's mortality rates using pandas' shift() method.
This transforms the data into a tabular format in which each row represents a single observation and each column represents a feature.
With the data in that form, the XGBoost model can be used directly to assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per country via groupby so one country's series never
# bleeds into another's; assumes rows are date-ordered within each country —
# TODO confirm against the CSV's ordering)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location' and 'date',
# which includes the target 'Mortality Rate' and its three lag columns — the
# target therefore leaks into the fitted components. PCA is also fit on the
# full dataset before the train/test split, leaking test rows into the
# transform; both effects inflate the near-perfect scores reported below.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # number of input variables for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the original
# variables — reusing the raw feature names here mislabels them and makes the
# downstream feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X is built solely from
# principal_df — so this one-hot encoding is dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, to avoid leaking test statistics
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (3*3*3*3*2*2 = 324 combinations, so 3240 fits
# with 10-fold CV)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9991769240906299
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set by
# default (refit=True), so this explicit fit is redundant but harmless.
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes KL divergence between normalized
# distributions, not a regression error; undefined for negative predictions.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005430612503529626 R2 Score: 0.999481950529423 RMSE: 0.073693 Entropy Value: 0.00018327219830469819
# Rank the tuned model's inputs by XGBoost feature importance.
# NOTE(review): the model was trained on PCA components (X was built from
# principal_df above), so the importances belong to principal components, not
# to the original variables. The previous code labelled them with
# selected_cols, silently misattributing each component's importance to a raw
# feature name — label them PC1..PCn instead.
importance_values = best_model.feature_importances_
feature_importances = pd.DataFrame({
    'feature': [f'PC{i + 1}' for i in range(len(importance_values))],
    'importance': importance_values,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.498915 |
| 0 | cardiovasc_death_rate | 0.238504 |
| 5 | aged_65_older | 0.181379 |
| 6 | median_age | 0.065676 |
| 2 | female_smokers | 0.012879 |
| 3 | male_smokers | 0.002292 |
| 4 | life_expectancy | 0.000355 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — the notebook only runs on
# this machine; prefer a relative path or a configurable constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this section
country1 = 'Italy'
country2 = 'Romania'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 17800 | Romania | 2/26/2020 | 6.892 | 0.828 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| 17801 | Romania | 2/27/2020 | 6.892 | 0.828 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| 17802 | Romania | 2/28/2020 | 6.892 | 0.828 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| 17803 | Romania | 2/29/2020 | 6.892 | 0.828 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| 17804 | Romania | 3/1/2020 | 6.892 | 0.828 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.180 | 0.892 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.180 | 0.892 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.180 | 0.892 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.180 | 0.892 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.180 | 0.892 | 35220.084 | 205.859 | 59037472 | 0.735109 |
2102 rows × 8 columns
'''
To apply the XGBoost model to this time-series dataset, I convert it into a supervised learning problem by creating lagged variables for the previous day's, previous week's, and previous month's mortality rates using pandas' shift() method.
This transforms the data into a tabular format in which each row represents a single observation and each column represents a feature.
With the data in that form, the XGBoost model can be used directly to assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (per-country shift via groupby; assumes rows are date-ordered within each
# country — TODO confirm)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the target 'Mortality Rate' and its three
# lag columns, so the target leaks into the fitted components; PCA is also fit
# on the full dataset before the train/test split. Both inflate the scores.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # number of input variables for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the original
# variables — reusing the raw feature names here mislabels them and makes the
# downstream feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X is built solely from
# principal_df — so this one-hot encoding is dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, to avoid leaking test statistics
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (3*3*3*3*2*2 = 324 combinations, so 3240 fits
# with 10-fold CV)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9973047338698764
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set by
# default (refit=True), so this explicit fit is redundant but harmless.
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes KL divergence between normalized
# distributions, not a regression error; undefined for negative predictions.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.025677028648985536 R2 Score: 0.9975505578626812 RMSE: 0.160241 Entropy Value: 0.0009073480150924102
# Rank the tuned model's inputs by XGBoost feature importance.
# NOTE(review): the model was trained on PCA components (X was built from
# principal_df above), so the importances belong to principal components, not
# to the original variables. The previous code labelled them with
# selected_cols, silently misattributing each component's importance to a raw
# feature name — label them PC1..PCn instead.
importance_values = best_model.feature_importances_
feature_importances = pd.DataFrame({
    'feature': [f'PC{i + 1}' for i in range(len(importance_values))],
    'importance': importance_values,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.613296 |
| 0 | hospital_beds_per_thousand | 0.157358 |
| 3 | population_density | 0.143481 |
| 2 | gdp_per_capita | 0.078615 |
| 4 | population | 0.007250 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — the notebook only runs on
# this machine; prefer a relative path or a configurable constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this section
country1 = 'Spain'
country2 = 'United States'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 24074 | Spain | 2/1/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24075 | Spain | 2/2/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24076 | Spain | 2/3/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24077 | Spain | 2/4/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24078 | Spain | 2/5/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084791 |
2136 rows × 10 columns
'''
To apply the XGBoost model to this time-series dataset, I convert it into a supervised learning problem by creating lagged variables for the previous day's, previous week's, and previous month's mortality rates using pandas' shift() method.
This transforms the data into a tabular format in which each row represents a single observation and each column represents a feature.
With the data in that form, the XGBoost model can be used directly to assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (per-country shift via groupby; assumes rows are date-ordered within each
# country — TODO confirm)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the target 'Mortality Rate' and its three
# lag columns, so the target leaks into the fitted components; PCA is also fit
# on the full dataset before the train/test split. Both inflate the scores.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # number of input variables for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the original
# variables — reusing the raw feature names here mislabels them and makes the
# downstream feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X is built solely from
# principal_df — so this one-hot encoding is dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, to avoid leaking test statistics
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (3*3*3*3*2*2 = 324 combinations, so 3240 fits
# with 10-fold CV)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9982647069016395
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set by
# default (refit=True), so this explicit fit is redundant but harmless.
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes KL divergence between normalized
# distributions, not a regression error; undefined for negative predictions.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.06052287316456901 R2 Score: 0.9898126721885138 RMSE: 0.246014 Entropy Value: 0.0021185863451388767
# Rank the tuned model's inputs by XGBoost feature importance.
# NOTE(review): the model was trained on PCA components (X was built from
# principal_df above), so the importances belong to principal components, not
# to the original variables. The previous code labelled them with
# selected_cols, silently misattributing each component's importance to a raw
# feature name — label them PC1..PCn instead.
importance_values = best_model.feature_importances_
feature_importances = pd.DataFrame({
    'feature': [f'PC{i + 1}' for i in range(len(importance_values))],
    'importance': importance_values,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.803399 |
| 0 | cardiovasc_death_rate | 0.129064 |
| 5 | aged_65_older | 0.034493 |
| 2 | female_smokers | 0.014970 |
| 3 | male_smokers | 0.013636 |
| 6 | median_age | 0.004311 |
| 4 | life_expectancy | 0.000126 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — the notebook only runs on
# this machine; prefer a relative path or a configurable constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this section
country1 = 'Spain'
country2 = 'United States'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 24074 | Spain | 2/1/2020 | 2.97 | 0.904 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| 24075 | Spain | 2/2/2020 | 2.97 | 0.904 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| 24076 | Spain | 2/3/2020 | 2.97 | 0.904 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| 24077 | Spain | 2/4/2020 | 2.97 | 0.904 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| 24078 | Spain | 2/5/2020 | 2.97 | 0.904 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2136 rows × 8 columns
'''
To apply the XGBoost model to this time-series dataset, I convert it into a supervised learning problem by creating lagged variables for the previous day's, previous week's, and previous month's mortality rates using pandas' shift() method.
This transforms the data into a tabular format in which each row represents a single observation and each column represents a feature.
With the data in that form, the XGBoost model can be used directly to assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country mortality time series into supervised-learning features:
# lags of 1 day, 7 days (previous week), and 30 days (previous month).
# Rows with no history yet (the first N days per country) get 0 instead of NaN.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, n_days in {'prev_day_mortality': 1,
                        'prev_week_mortality': 7,
                        'prev_month_mortality': 30}.items():
    df_updated[lag_col] = mortality_by_country.shift(n_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] takes every column after 'location' and 'date',
# which includes 'Mortality Rate' (the prediction target) and the three lag features —
# the target therefore leaks into the PCA inputs; confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
# Keep only the first 5 components (PCA orders columns by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL PCA inputs),
# not the original variables — labelling them with raw feature names makes the downstream
# feature-importance table read as if it ranked the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot columns are never used below — X is built from principal_df only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of daily time-series rows mixes past and future
# observations between train and test; a chronological split would be stricter.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using the training-set statistics, so no test leakage here)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (3*3*3*3*2*2 = 324 combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 uses all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and the best mean CV score
# (no scoring= given, so this is the regressor's default score — R^2 for sklearn-API regressors)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9974418277786332
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV(refit=True, the default) already refits best_estimator_
# on the full training set, so this explicit fit is redundant (but harmless).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into probability
# distributions and returns their KL divergence — not a standard regression metric,
# and it assumes non-negative inputs; confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0702219546964368 R2 Score: 0.988180103906985 RMSE: 0.264994 Entropy Value: 0.0028392443335123986
# Rank the model inputs (PCA components carrying raw-feature labels) by their
# learned XGBoost importance scores, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.588371 |
| 4 | population | 0.187305 |
| 0 | hospital_beds_per_thousand | 0.093021 |
| 2 | gdp_per_capita | 0.077402 |
| 3 | population_density | 0.053900 |
# Country Pair by Pair Analysis relative to gdp_per_capita
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): hard-coded absolute Windows path — consider a relative path or config value.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Showing the pairings of countries based on gdp_per_capita (13 pairs of countries)
# One per-country view of df; each df_<Country> keeps only that country's daily rows.
df_Ireland = df[(df.location == "Ireland")]
df_Luxembourg = df[(df.location == "Luxembourg")]
df_Switzerland = df[(df.location == "Switzerland")]
df_UnitedStates = df[(df.location == "United States")]
df_Austria = df[(df.location == "Austria")]
df_Belgium = df[(df.location == "Belgium")]
df_Canada = df[(df.location == "Canada")]
df_Denmark = df[(df.location == "Denmark")]
df_Finland = df[(df.location == "Finland")]
df_France = df[(df.location == "France")]
df_Iceland = df[(df.location == "Iceland")]
df_Italy = df[(df.location == "Italy")]
df_Netherlands = df[(df.location == "Netherlands")]
df_Sweden = df[(df.location == "Sweden")]
df_UnitedKingdom = df[(df.location == "United Kingdom")]
df_Bulgaria = df[(df.location == "Bulgaria")]
df_Cyprus = df[(df.location == "Cyprus")]
df_Czechia = df[(df.location == "Czechia")]
df_Estonia = df[(df.location == "Estonia")]
df_Latvia = df[(df.location == "Latvia")]
df_Portugal = df[(df.location == "Portugal")]
df_Romania = df[(df.location == "Romania")]
df_Serbia = df[(df.location == "Serbia")]
df_Slovakia = df[(df.location == "Slovakia")]
df_Slovenia = df[(df.location == "Slovenia")]
df_Spain = df[(df.location == "Spain")]
# NOTE(review): tail(-2) drops the United Kingdom's first two rows — presumably to
# align its start date with the other countries; confirm against the source data.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)
# Exporting final_dataframe to CSV file
# NOTE(review): written to the current working directory but re-read below from
# C:/Users/marco/Downloads — the file must be moved between cells; verify the workflow.
dataframe_one.to_csv("dataframe-one.csv")
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): dataframe-one.csv was written above with the default index=True; check
# whether an 'Unnamed: 0' index column survives the round trip (the display below
# suggests it does not) since later iloc-based column slicing depends on positions.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Restrict the analysis to one GDP-matched country pair and to the
# population-health predictors used by this XGBoost model.
country1 = 'Ireland'
country2 = 'Luxembourg'
health_index_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                     'female_smokers', 'male_smokers', 'life_expectancy',
                     'aged_65_older', 'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, health_index_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
2076 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build supervised-learning lag features from the per-country mortality series:
# 1-day, 7-day (weekly), and 30-day (monthly) lags, with missing history set to 0.
grouped_mortality = df_updated.groupby(['location'])['Mortality Rate']
for feature_name, shift_days in (('prev_day_mortality', 1),
                                 ('prev_week_mortality', 7),
                                 ('prev_month_mortality', 30)):
    df_updated[feature_name] = grouped_mortality.shift(shift_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lag columns, so the
# prediction target leaks into the PCA inputs; confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the raw variables — naming
# them after the raw features makes the later importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot columns are never used below — X is built from principal_df only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of daily time-series rows mixes past and future
# observations between train and test; a chronological split would be stricter.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (training-set statistics only)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and best mean CV score (regressor default, i.e. R^2)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987521060977691
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits best_estimator_ on the training set
# (refit=True by default), so this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) returns the KL divergence of the two
# normalized arrays — not a standard regression metric; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0015514128412379164 R2 Score: 0.9993205997164117 RMSE: 0.039388 Entropy Value: 0.0003261513916954156
# Tabulate XGBoost importance scores for the model inputs and sort, highest first.
importance_table = pd.DataFrame({'feature': selected_cols,
                                 'importance': best_model.feature_importances_})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 6 | median_age | 0.681071 |
| 5 | aged_65_older | 0.213986 |
| 0 | cardiovasc_death_rate | 0.084737 |
| 1 | diabetes_prevalence | 0.013333 |
| 2 | female_smokers | 0.006390 |
| 3 | male_smokers | 0.000304 |
| 4 | life_expectancy | 0.000178 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — consider a relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Keep only the GDP-matched Ireland / Luxembourg pair and the
# country-health-index predictors used by this XGBoost model.
country1 = 'Ireland'
country2 = 'Luxembourg'
country_index_cols = ['location', 'date', 'hospital_beds_per_thousand',
                      'human_development_index', 'extreme_poverty',
                      'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            country_index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 4.51 | 0.916 | 0.2 | 231.447 | 647601 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 4.51 | 0.916 | 0.2 | 231.447 | 647601 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 4.51 | 0.916 | 0.2 | 231.447 | 647601 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 4.51 | 0.916 | 0.2 | 231.447 | 647601 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 4.51 | 0.916 | 0.2 | 231.447 | 647601 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 2.96 | 0.955 | 0.2 | 69.874 | 5023108 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 2.96 | 0.955 | 0.2 | 69.874 | 5023108 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 2.96 | 0.955 | 0.2 | 69.874 | 5023108 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 2.96 | 0.955 | 0.2 | 69.874 | 5023108 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 2.96 | 0.955 | 0.2 | 69.874 | 5023108 | 0.491388 |
2076 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the mortality time series into supervised features: per-country lags of
# one day, one week, and one month (0 where no earlier observation exists).
mortality_series = df_updated.groupby(['location'])['Mortality Rate']
for lag_name, lag in [('prev_day_mortality', 1),
                      ('prev_week_mortality', 7),
                      ('prev_month_mortality', 30)]:
    df_updated[lag_name] = mortality_series.shift(lag).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lag columns, so the
# prediction target leaks into the PCA inputs; confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components, not the raw variables — the raw-feature
# labels make the downstream importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot columns are never used below — X is built from principal_df only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of daily time-series rows mixes past and future between
# train and test; a chronological split would be stricter.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (training-set statistics only)
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune (324 combinations)
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters and best mean CV score (regressor default, i.e. R^2)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9951141561453272
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits best_estimator_ on the training set
# (refit=True by default), so this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) returns the KL divergence of the two
# normalized arrays — not a standard regression metric; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00468197321349643 R2 Score: 0.9979496534742719 RMSE: 0.068425 Entropy Value: 0.0012251208567780243
# Pair each model input with its learned XGBoost importance and sort, highest first.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.741234 |
| 0 | hospital_beds_per_thousand | 0.114263 |
| 2 | extreme_poverty | 0.070959 |
| 4 | population | 0.047536 |
| 3 | population_density | 0.026008 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — consider a relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Restrict to the Switzerland / United States pair and the population-health
# predictors used by this XGBoost model.
country1 = 'Switzerland'
country2 = 'United States'
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                          'female_smokers', 'male_smokers', 'life_expectancy',
                          'aged_65_older', 'median_age', 'Mortality Rate']
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_pair, population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084791 |
2112 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Supervised-learning conversion: previous-day, previous-week, and previous-month
# mortality lags per country; leading NaNs (no history yet) are replaced with 0.
by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = by_country.shift(1).fillna(0)
df_updated['prev_week_mortality'] = by_country.shift(7).fillna(0)
df_updated['prev_month_mortality'] = by_country.shift(30).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date',
# which at this point includes the 'Mortality Rate' target itself and its
# lagged copies — the fitted components therefore leak the target into any
# model built on them. Consider dropping 'Mortality Rate' before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep 7 principal components — one per population-health-index input variable.
n_components = 7
# NOTE(review): the original code transformed df_updated.iloc[:, 2:], which
# still contains the 'Mortality Rate' target (and its lagged copies), leaking
# the target into the model inputs; it also labelled the components with the
# original feature names, which misattributes downstream feature importances.
# The target is excluded here and components are fitted on features only.
feature_matrix = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
principal_components = PCA(n_components=n_components).fit_transform(feature_matrix)
# A principal component is a linear combination of ALL inputs, not any single
# original variable, so label the columns honestly as PC1..PCn.
selected_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (kept for parity with the original flow;
# the model input X below uses only the principal components)
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set (fixed seed for
# reproducibility), then fit the standardiser on the training portion only
# so no test-set statistics leak into the scaling.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the train-fitted scaler
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor plus the hyperparameter grid to tune over
xgb_model = xgb.XGBRegressor()
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation, on all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9866816704098931
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): with two arguments scipy.stats.entropy normalises each vector
# into a probability distribution and returns their KL divergence; it is not
# a conventional regression metric, so interpret this value with care.
entropy_val = entropy(y_test, y_pred)
# NOTE(review): near-perfect scores here are suspect — the PCA inputs
# upstream (iloc[:, 2:]) include 'Mortality Rate' and its lags (target
# leakage), which inflates the test-set fit.
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004320498645038054 R2 Score: 0.9979842946002436 RMSE: 0.065731 Entropy Value: 0.0005109868873380628
# Rank the model inputs by XGBoost's importance scores, highest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.810118 |
| 6 | median_age | 0.055796 |
| 2 | female_smokers | 0.052323 |
| 4 | life_expectancy | 0.045962 |
| 0 | cardiovasc_death_rate | 0.014979 |
| 5 | aged_65_older | 0.013223 |
| 3 | male_smokers | 0.007598 |
# Reload the full 26-country dataframe for the next model run
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Compare Switzerland vs. the United States on the country-health-index features
country1 = 'Switzerland'
country2 = 'United States'
country_index_cols = ['location', 'date', 'hospital_beds_per_thousand',
                      'human_development_index', 'extreme_poverty',
                      'population_density', 'population', 'Mortality Rate']
# Restrict rows to the two countries and keep only the selected columns
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            country_index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 4.53 | 0.955 | 0.03 | 214.243 | 8740471 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 4.53 | 0.955 | 0.03 | 214.243 | 8740471 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 4.53 | 0.955 | 0.03 | 214.243 | 8740471 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 4.53 | 0.955 | 0.03 | 214.243 | 8740471 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 4.53 | 0.955 | 0.03 | 214.243 | 8740471 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 1.20 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 1.20 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 1.20 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 1.20 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 1.20 | 35.608 | 338289856 | 1.084791 |
2112 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country time series into a supervised-learning layout by
# adding lagged mortality features: previous day (1), week (7), month (30).
# The leading NaNs produced by each shift are filled with 0.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date',
# which at this point includes the 'Mortality Rate' target itself and its
# lagged copies — the fitted components therefore leak the target into any
# model built on them. Consider dropping 'Mortality Rate' before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep 5 principal components — one per country-health-index input variable.
n_components = 5
# NOTE(review): the original code transformed df_updated.iloc[:, 2:], which
# still contains the 'Mortality Rate' target (and its lagged copies), leaking
# the target into the model inputs; it also labelled the components with the
# original feature names, which misattributes downstream feature importances.
# The target is excluded here and components are fitted on features only.
feature_matrix = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
principal_components = PCA(n_components=n_components).fit_transform(feature_matrix)
# A principal component is a linear combination of ALL inputs, not any single
# original variable, so label the columns honestly as PC1..PCn.
selected_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (kept for parity with the original flow;
# the model input X below uses only the principal components)
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set (fixed seed for
# reproducibility), then fit the standardiser on the training portion only
# so no test-set statistics leak into the scaling.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the train-fitted scaler
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor plus the hyperparameter grid to tune over
xgb_model = xgb.XGBRegressor()
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation, on all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9858714493338189
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): with two arguments scipy.stats.entropy normalises each vector
# into a probability distribution and returns their KL divergence; it is not
# a conventional regression metric, so interpret this value with care.
entropy_val = entropy(y_test, y_pred)
# NOTE(review): near-perfect scores here are suspect — the PCA inputs
# upstream (iloc[:, 2:]) include 'Mortality Rate' and its lags (target
# leakage), which inflates the test-set fit.
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.013696625921283593 R2 Score: 0.9936099128604793 RMSE: 0.117033 Entropy Value: 0.0009584504854066147
# Rank the model inputs by XGBoost's importance scores, highest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.854932 |
| 2 | extreme_poverty | 0.067287 |
| 4 | population | 0.042602 |
| 3 | population_density | 0.022960 |
| 0 | hospital_beds_per_thousand | 0.012219 |
# Reload the full 26-country dataframe for the next model run
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Compare Austria vs. Belgium on the population-health-index features
country1 = 'Austria'
country2 = 'Belgium'
population_index_cols = ['location', 'date', 'cardiovasc_death_rate',
                         'diabetes_prevalence', 'female_smokers',
                         'male_smokers', 'life_expectancy', 'aged_65_older',
                         'median_age', 'Mortality Rate']
# Restrict rows to the two countries and keep only the selected columns
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            population_index_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
2099 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country time series into a supervised-learning layout by
# adding lagged mortality features: previous day (1), week (7), month (30).
# The leading NaNs produced by each shift are filled with 0.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date',
# which at this point includes the 'Mortality Rate' target itself and its
# lagged copies — the fitted components therefore leak the target into any
# model built on them. Consider dropping 'Mortality Rate' before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep 7 principal components — one per population-health-index input variable.
n_components = 7
# NOTE(review): the original code transformed df_updated.iloc[:, 2:], which
# still contains the 'Mortality Rate' target (and its lagged copies), leaking
# the target into the model inputs; it also labelled the components with the
# original feature names, which misattributes downstream feature importances.
# The target is excluded here and components are fitted on features only.
feature_matrix = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
principal_components = PCA(n_components=n_components).fit_transform(feature_matrix)
# A principal component is a linear combination of ALL inputs, not any single
# original variable, so label the columns honestly as PC1..PCn.
selected_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (kept for parity with the original flow;
# the model input X below uses only the principal components)
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set (fixed seed for
# reproducibility), then fit the standardiser on the training portion only
# so no test-set statistics leak into the scaling.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the train-fitted scaler
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor plus the hyperparameter grid to tune over
xgb_model = xgb.XGBRegressor()
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation, on all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9985877971766527
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): with two arguments scipy.stats.entropy normalises each vector
# into a probability distribution and returns their KL divergence; it is not
# a conventional regression metric, so interpret this value with care.
entropy_val = entropy(y_test, y_pred)
# NOTE(review): near-perfect scores here are suspect — the PCA inputs
# upstream (iloc[:, 2:]) include 'Mortality Rate' and its lags (target
# leakage), which inflates the test-set fit.
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005399476083504289 R2 Score: 0.9995439543541608 RMSE: 0.073481 Entropy Value: 0.0003649161068700861
# Rank the model inputs by XGBoost's importance scores, highest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 6 | median_age | 0.846396 |
| 0 | cardiovasc_death_rate | 0.062219 |
| 1 | diabetes_prevalence | 0.061911 |
| 5 | aged_65_older | 0.025860 |
| 2 | female_smokers | 0.002106 |
| 3 | male_smokers | 0.001366 |
| 4 | life_expectancy | 0.000142 |
# Reload the full 26-country dataframe for the next model run
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Compare Austria vs. Belgium on the country-health-index features
country1 = 'Austria'
country2 = 'Belgium'
country_index_cols = ['location', 'date', 'hospital_beds_per_thousand',
                      'human_development_index', 'extreme_poverty',
                      'population_density', 'population', 'Mortality Rate']
# Restrict rows to the two countries and keep only the selected columns
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            country_index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.7 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.7 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.7 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.7 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.7 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 5.64 | 0.931 | 0.2 | 375.564 | 11655923 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 5.64 | 0.931 | 0.2 | 375.564 | 11655923 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 5.64 | 0.931 | 0.2 | 375.564 | 11655923 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 5.64 | 0.931 | 0.2 | 375.564 | 11655923 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 5.64 | 0.931 | 0.2 | 375.564 | 11655923 | 0.711787 |
2099 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country time series into a supervised-learning layout by
# adding lagged mortality features: previous day (1), week (7), month (30).
# The leading NaNs produced by each shift are filled with 0.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date',
# which at this point includes the 'Mortality Rate' target itself and its
# lagged copies — the fitted components therefore leak the target into any
# model built on them. Consider dropping 'Mortality Rate' before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep 5 principal components — one per country-health-index input variable.
n_components = 5
# NOTE(review): the original code transformed df_updated.iloc[:, 2:], which
# still contains the 'Mortality Rate' target (and its lagged copies), leaking
# the target into the model inputs; it also labelled the components with the
# original feature names, which misattributes downstream feature importances.
# The target is excluded here and components are fitted on features only.
feature_matrix = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
principal_components = PCA(n_components=n_components).fit_transform(feature_matrix)
# A principal component is a linear combination of ALL inputs, not any single
# original variable, so label the columns honestly as PC1..PCn.
selected_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (kept for parity with the original flow;
# the model input X below uses only the principal components)
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set (fixed seed for
# reproducibility), then fit the standardiser on the training portion only
# so no test-set statistics leak into the scaling.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the train-fitted scaler
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# XGBoost regressor plus the hyperparameter grid to tune over
xgb_model = xgb.XGBRegressor()
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search with 10-fold cross-validation, on all CPU cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its CV score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979482681643022
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): with two arguments scipy.stats.entropy normalises each vector
# into a probability distribution and returns their KL divergence; it is not
# a conventional regression metric, so interpret this value with care.
entropy_val = entropy(y_test, y_pred)
# NOTE(review): near-perfect scores here are suspect — the PCA inputs
# upstream (iloc[:, 2:]) include 'Mortality Rate' and its lags (target
# leakage), which inflates the test-set fit.
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.017351533220202466 R2 Score: 0.9985344705576373 RMSE: 0.131725 Entropy Value: 0.0015777407587553624
# Rank the model inputs by XGBoost's importance scores, highest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.700511 |
| 2 | extreme_poverty | 0.139220 |
| 0 | hospital_beds_per_thousand | 0.123668 |
| 3 | population_density | 0.032006 |
| 4 | population | 0.004595 |
# Reload the full 26-country dataframe for the next model run
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this analysis run
country1 = 'Canada'
country2 = 'Denmark'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered frame an explicit copy, so the lagged-mortality
# columns assigned later do not trigger pandas' SettingWithCopyWarning
# (the original code assigned into a slice of the selection above).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.093162 |
2134 rows × 10 columns
# To apply XGBoost to this time series it is recast as a supervised-learning
# problem: each row is one observation and the predictors include the
# mortality rate observed 1 day, 7 days, and 30 days earlier.  The lagged
# columns are computed per country with groupby().shift(); the leading NaNs
# (dates with no earlier observation in that country) are replaced by 0.
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after location/date, which at
# this point includes 'Mortality Rate' itself and the three lagged-mortality
# columns — the prediction target leaks into the PCA inputs.  PCA is also
# fit on the full, unscaled, unsplit dataset, so large-magnitude columns
# dominate the components and test rows influence the fit.  Confirm whether
# this is intended before reusing these results.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but
# each column is a principal component (a mixture of all inputs), not the
# named variable — downstream feature importances inherit this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are not used below — X is
# built from principal_df and y from 'Mortality Rate' only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# (70/30 split; fixed random_state makes the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so test-set statistics do not leak
# into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
# NOTE(review): no random_state is set and the tuned grid includes
# subsample/colsample_bytree < 1, so refits may not be exactly reproducible.
xgb_model = xgb.XGBRegressor()
# Search space for XGBoost hyperparameter tuning
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search scored by 10-fold cross-validation (k = 10),
# run in parallel on all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9992606850560544
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0025729522227882153 R2 Score: 0.9993859368044151 RMSE: 0.050724 Entropy Value: 0.00030984264476932686
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.665696 |
| 6 | median_age | 0.173524 |
| 0 | cardiovasc_death_rate | 0.127952 |
| 5 | aged_65_older | 0.019835 |
| 2 | female_smokers | 0.011763 |
| 3 | male_smokers | 0.001135 |
| 4 | life_expectancy | 0.000095 |
# Importing the dataframe of all 26 countries
# (re-loads the same CSV so this run starts from the unfiltered data;
#  the absolute Windows path makes this cell non-portable)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this analysis run
country1 = 'Canada'
country2 = 'Denmark'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame an explicit copy, so the lagged-mortality
# columns assigned later do not trigger pandas' SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 2.5 | 0.940 | 0.2 | 136.520 | 5882259 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 2.5 | 0.940 | 0.2 | 136.520 | 5882259 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 2.5 | 0.940 | 0.2 | 136.520 | 5882259 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 2.5 | 0.940 | 0.2 | 136.520 | 5882259 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 2.5 | 0.940 | 0.2 | 136.520 | 5882259 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.5 | 0.929 | 0.5 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.5 | 0.929 | 0.5 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.5 | 0.929 | 0.5 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.5 | 0.929 | 0.5 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.5 | 0.929 | 0.5 | 4.037 | 38454328 | 1.093162 |
2134 rows × 8 columns
# Recast the time series as a supervised-learning problem for XGBoost:
# each row is one observation whose predictors include the mortality rate
# observed 1 day, 7 days, and 30 days earlier.  Lags are computed per
# country via groupby().shift(); leading NaNs (no earlier observation)
# are replaced by 0.
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and the
# three lagged-mortality columns, so the target leaks into the PCA inputs;
# PCA is also fit on the full, unscaled, unsplit dataset ('population'
# dwarfs the other columns).  Confirm whether this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but
# each column is a principal component, not the named variable.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not used below — X comes from
# principal_df and y from 'Mortality Rate' only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# (70/30 split; fixed random_state makes the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so test-set statistics do not leak
# into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
# NOTE(review): no random_state is set and the tuned grid includes
# subsample/colsample_bytree < 1, so refits may not be exactly reproducible.
xgb_model = xgb.XGBRegressor()
# Search space for XGBoost hyperparameter tuning
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search scored by 10-fold cross-validation (k = 10),
# run in parallel on all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9981308764210667
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.006611715252313853 R2 Score: 0.998422041824106 RMSE: 0.081312 Entropy Value: 0.001601041202106293
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.723446 |
| 0 | hospital_beds_per_thousand | 0.146505 |
| 2 | extreme_poverty | 0.065728 |
| 3 | population_density | 0.048554 |
| 4 | population | 0.015767 |
# Importing the dataframe of all 26 countries
# (re-loads the same CSV so this run starts from the unfiltered data;
#  the absolute Windows path makes this cell non-portable)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this analysis run
country1 = 'Finland'
country2 = 'France'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered frame an explicit copy, so the lagged-mortality
# columns assigned later do not trigger pandas' SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| 7311 | Finland | 1/30/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| 7312 | Finland | 1/31/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| 7313 | Finland | 2/1/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| 7314 | Finland | 2/2/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411710 |
| 9443 | France | 12/26/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411282 |
| 9444 | France | 12/27/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411730 |
| 9445 | France | 12/28/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411813 |
| 9446 | France | 12/29/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411892 |
2137 rows × 10 columns
# Recast the time series as a supervised-learning problem for XGBoost:
# each row is one observation whose predictors include the mortality rate
# observed 1 day, 7 days, and 30 days earlier.  Lags are computed per
# country via groupby().shift(); leading NaNs (no earlier observation)
# are replaced by 0.
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and the
# three lagged-mortality columns, so the target leaks into the PCA inputs;
# PCA is also fit on the full, unscaled, unsplit dataset.  Confirm whether
# this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but
# each column is a principal component, not the named variable.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not used below — X comes from
# principal_df and y from 'Mortality Rate' only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# (70/30 split; fixed random_state makes the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so test-set statistics do not leak
# into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
# NOTE(review): no random_state is set and the tuned grid includes
# subsample/colsample_bytree < 1, so refits may not be exactly reproducible.
xgb_model = xgb.XGBRegressor()
# Search space for XGBoost hyperparameter tuning
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
# Exhaustive grid search scored by 10-fold cross-validation (k = 10),
# run in parallel on all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.995847722245489
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.025345261586814726 R2 Score: 0.9974928018718409 RMSE: 0.159202 Entropy Value: 0.0013971728366706324
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.435901 |
| 1 | diabetes_prevalence | 0.345667 |
| 5 | aged_65_older | 0.147699 |
| 2 | female_smokers | 0.029630 |
| 6 | median_age | 0.019171 |
| 3 | male_smokers | 0.014463 |
| 4 | life_expectancy | 0.007470 |
# Importing the dataframe of all 26 countries
# (re-loads the same CSV so this run starts from the unfiltered data;
#  the absolute Windows path makes this cell non-portable)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this analysis run
country1 = 'Finland'
country2 = 'France'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame an explicit copy, so the lagged-mortality
# columns assigned later do not trigger pandas' SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 3.28 | 0.938 | 0.04 | 18.136 | 5540745 | 0.000000 |
| 7311 | Finland | 1/30/2020 | 3.28 | 0.938 | 0.04 | 18.136 | 5540745 | 0.000000 |
| 7312 | Finland | 1/31/2020 | 3.28 | 0.938 | 0.04 | 18.136 | 5540745 | 0.000000 |
| 7313 | Finland | 2/1/2020 | 3.28 | 0.938 | 0.04 | 18.136 | 5540745 | 0.000000 |
| 7314 | Finland | 2/2/2020 | 3.28 | 0.938 | 0.04 | 18.136 | 5540745 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 5.98 | 0.901 | 0.02 | 122.578 | 67813000 | 0.411710 |
| 9443 | France | 12/26/2022 | 5.98 | 0.901 | 0.02 | 122.578 | 67813000 | 0.411282 |
| 9444 | France | 12/27/2022 | 5.98 | 0.901 | 0.02 | 122.578 | 67813000 | 0.411730 |
| 9445 | France | 12/28/2022 | 5.98 | 0.901 | 0.02 | 122.578 | 67813000 | 0.411813 |
| 9446 | France | 12/29/2022 | 5.98 | 0.901 | 0.02 | 122.578 | 67813000 | 0.411892 |
2137 rows × 8 columns
# Recast the time series as a supervised-learning problem for XGBoost:
# each row is one observation whose predictors include the mortality rate
# observed 1 day, 7 days, and 30 days earlier.  Lags are computed per
# country via groupby().shift(); leading NaNs (no earlier observation)
# are replaced by 0.
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PCA scores — linear mixtures of ALL input
# columns, including the mortality lags and the target — NOT the original
# variables. Labelling them with the raw feature names is misleading, and the
# "feature importances" computed downstream describe components, not features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never included in X below, so this
# encoding has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split shuffles a time series, putting future days in
# the training set — a chronological split gives honest evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale the train/test features with statistics learned from the training split only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters are chosen by the grid search below
xgb_model = xgb.XGBRegressor()
# Search space for hyperparameter tuning
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelised over all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9953573241288052
# Refit the tuned model on the full training split
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance: MSE, RMSE, R^2, and an entropy diagnostic.
# scipy.stats.entropy treats its two arguments as (unnormalised) probability
# distributions; a zero/negative entry in y_pred paired with a positive y_test
# entry makes the KL term infinite (this produced "Entropy Value: inf" before).
# Restrict the diagnostic to strictly positive pairs so it stays finite.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
pos = (y_test > 0) & (y_pred > 0)
entropy_val = entropy(y_test[pos], y_pred[pos]) if pos.any() else np.nan
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.08019301369568292 R2 Score: 0.9920671651724497 RMSE: 0.283184 Entropy Value: inf
# Rank the model inputs by learned importance, highest first
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.459088 |
| 4 | population | 0.234622 |
| 0 | hospital_beds_per_thousand | 0.205086 |
| 2 | extreme_poverty | 0.079619 |
| 3 | population_density | 0.021585 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; prefer a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The two countries compared in this run
country1 = 'Iceland'
country2 = 'Italy'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict to the two countries. .copy() makes the result an independent frame
# so the lag-column assignments below do not write into a filtered view
# (avoids SettingWithCopyWarning / silently lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 20911 | Iceland | 2/28/2020 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| 20912 | Iceland | 2/29/2020 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| 20913 | Iceland | 3/1/2020 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| 20914 | Iceland | 3/2/2020 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| 20915 | Iceland | 3/3/2020 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
2100 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes rows are already chronological within each
# location — the frame is never sorted by date here; confirm the CSV order.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling with 0 fabricates a zero mortality rate for each
# country's first day/week/month; dropping those rows would be safer.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the target column 'Mortality Rate' AND the
# three lag columns, so the target leaks into the principal components later
# used as model inputs — this largely explains the near-perfect R^2 downstream.
# PCA is also fit on unscaled data; large-magnitude columns dominate it.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PCA scores — linear mixtures of ALL input
# columns, including the mortality lags and the target — NOT the original
# variables. Labelling them with the raw feature names is misleading, and the
# "feature importances" computed downstream describe components, not features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never included in X below, so this
# encoding has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split shuffles a time series, putting future days in
# the training set — a chronological split gives honest evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale the train/test features with statistics learned from the training split only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters are chosen by the grid search below
xgb_model = xgb.XGBRegressor()
# Search space for hyperparameter tuning
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelised over all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9995037277358563
# Refit the tuned model on the full training split
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance: MSE, RMSE, R^2, and an entropy diagnostic.
# scipy.stats.entropy treats its two arguments as (unnormalised) probability
# distributions; a zero/negative entry in y_pred paired with a positive y_test
# entry makes the KL term infinite. Restrict to strictly positive pairs so the
# diagnostic stays finite.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
pos = (y_test > 0) & (y_pred > 0)
entropy_val = entropy(y_test[pos], y_pred[pos]) if pos.any() else np.nan
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.012240815117304085 R2 Score: 0.9989962218220662 RMSE: 0.110638 Entropy Value: 0.0007257267656131561
# Rank the model inputs by learned importance, highest first
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.468617 |
| 6 | median_age | 0.310603 |
| 1 | diabetes_prevalence | 0.184342 |
| 2 | female_smokers | 0.019198 |
| 5 | aged_65_older | 0.017014 |
| 3 | male_smokers | 0.000171 |
| 4 | life_expectancy | 0.000056 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; prefer a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The two countries compared in this run
country1 = 'Iceland'
country2 = 'Italy'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the two countries. .copy() makes the result an independent frame
# so the lag-column assignments below do not write into a filtered view
# (avoids SettingWithCopyWarning / silently lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 20911 | Iceland | 2/28/2020 | 2.91 | 0.949 | 0.2 | 3.404 | 372903 | 0.000000 |
| 20912 | Iceland | 2/29/2020 | 2.91 | 0.949 | 0.2 | 3.404 | 372903 | 0.000000 |
| 20913 | Iceland | 3/1/2020 | 2.91 | 0.949 | 0.2 | 3.404 | 372903 | 0.000000 |
| 20914 | Iceland | 3/2/2020 | 2.91 | 0.949 | 0.2 | 3.404 | 372903 | 0.000000 |
| 20915 | Iceland | 3/3/2020 | 2.91 | 0.949 | 0.2 | 3.404 | 372903 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.18 | 0.892 | 2.0 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.18 | 0.892 | 2.0 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.18 | 0.892 | 2.0 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.18 | 0.892 | 2.0 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.18 | 0.892 | 2.0 | 205.859 | 59037472 | 0.735109 |
2100 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes rows are already chronological within each
# location — the frame is never sorted by date here; confirm the CSV order.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling with 0 fabricates a zero mortality rate for each
# country's first day/week/month; dropping those rows would be safer.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the target column 'Mortality Rate' AND the
# three lag columns, so the target leaks into the principal components later
# used as model inputs — this largely explains the near-perfect R^2 downstream.
# PCA is also fit on unscaled data; the huge 'population' column dominates it.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PCA scores — linear mixtures of ALL input
# columns, including the mortality lags and the target — NOT the original
# variables. Labelling them with the raw feature names is misleading, and the
# "feature importances" computed downstream describe components, not features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never included in X below, so this
# encoding has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split shuffles a time series, putting future days in
# the training set — a chronological split gives honest evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale the train/test features with statistics learned from the training split only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters are chosen by the grid search below
xgb_model = xgb.XGBRegressor()
# Search space for hyperparameter tuning
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelised over all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9992536802375411
# Refit the tuned model on the full training split
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance: MSE, RMSE, R^2, and an entropy diagnostic.
# scipy.stats.entropy treats its two arguments as (unnormalised) probability
# distributions; a zero/negative entry in y_pred paired with a positive y_test
# entry makes the KL term infinite. Restrict to strictly positive pairs so the
# diagnostic stays finite.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
pos = (y_test > 0) & (y_pred > 0)
entropy_val = entropy(y_test[pos], y_pred[pos]) if pos.any() else np.nan
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.03734838401444865 R2 Score: 0.9969373368933739 RMSE: 0.193257 Entropy Value: 0.0026011322728448083
# Rank the model inputs by learned importance, highest first
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | hospital_beds_per_thousand | 0.374572 |
| 1 | human_development_index | 0.351103 |
| 3 | population_density | 0.183497 |
| 2 | extreme_poverty | 0.088126 |
| 4 | population | 0.002702 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; prefer a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The two countries compared in this run
country1 = 'Netherlands'
country2 = 'Sweden'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict to the two countries. .copy() makes the result an independent frame
# so the lag-column assignments below do not write into a filtered view
# (avoids SettingWithCopyWarning / silently lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.816005 |
2100 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes rows are already chronological within each
# location — the frame is never sorted by date here; confirm the CSV order.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling with 0 fabricates a zero mortality rate for each
# country's first day/week/month; dropping those rows would be safer.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the target column 'Mortality Rate' AND the
# three lag columns, so the target leaks into the principal components later
# used as model inputs — this largely explains the near-perfect R^2 downstream.
# PCA is also fit on unscaled data; large-magnitude columns dominate it.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PCA scores — linear mixtures of ALL input
# columns, including the mortality lags and the target — NOT the original
# variables. Labelling them with the raw feature names is misleading, and the
# "feature importances" computed downstream describe components, not features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never included in X below, so this
# encoding has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split shuffles a time series, putting future days in
# the training set — a chronological split gives honest evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale the train/test features with statistics learned from the training split only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base XGBoost regressor; its hyperparameters are chosen by the grid search below
xgb_model = xgb.XGBRegressor()
# Search space for hyperparameter tuning
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelised over all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9974557008389675
# Refit the tuned model on the full training split
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance: MSE, RMSE, R^2, and an entropy diagnostic.
# scipy.stats.entropy treats its two arguments as (unnormalised) probability
# distributions; a zero/negative entry in y_pred paired with a positive y_test
# entry makes the KL term infinite. Restrict to strictly positive pairs so the
# diagnostic stays finite.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
pos = (y_test > 0) & (y_pred > 0)
entropy_val = entropy(y_test[pos], y_pred[pos]) if pos.any() else np.nan
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.009632293525443848 R2 Score: 0.9990787800266875 RMSE: 0.098144 Entropy Value: 0.0005803548472550031
# Rank the model inputs by learned importance, highest first
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.788294 |
| 2 | female_smokers | 0.150990 |
| 6 | median_age | 0.023584 |
| 0 | cardiovasc_death_rate | 0.017991 |
| 3 | male_smokers | 0.017966 |
| 5 | aged_65_older | 0.000835 |
| 4 | life_expectancy | 0.000340 |
# Importing the dataframe of all 26 countries.
# NOTE(review): absolute, machine-specific path -- consider a relative path or
# a configurable data directory for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the country-health-index analysis.
country1 = 'Netherlands'
country2 = 'Sweden'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows for the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.944 | 0.1 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.944 | 0.1 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.944 | 0.1 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.944 | 0.1 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.944 | 0.1 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.945 | 0.5 | 24.718 | 10549349 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.945 | 0.5 | 24.718 | 10549349 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.945 | 0.5 | 24.718 | 10549349 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.945 | 0.5 | 24.718 | 10549349 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.945 | 0.5 | 24.718 | 10549349 | 0.816005 |
2100 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features per country -- previous day (1), previous
# week (7), and previous month (30) -- so the time series becomes a supervised
# learning problem. Rows with no history (the first 1/7/30 days of each
# country) get 0 in place of the NaN produced by the shift.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the predictors.
# NOTE(review): the original code fit PCA on df_updated.iloc[:, 2:], which
# includes the 'Mortality Rate' TARGET column itself -- that leaks the target
# into the components and inflates every downstream model score. Fit on the
# predictor columns only (original features plus the lagged-mortality features).
# NOTE(review): PCA is still fit on all rows before the train/test split; for a
# fully leak-free evaluation it should be fit on the training split only.
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep 5 principal components -- the number of input variables used by the
# XGBoost model for the country health index.
n_components = 5
principal_components = pca.transform(pca_input)[:, :n_components]
# First 5 principal components as a dataframe. The column names are kept equal
# to the original feature names only so downstream column selection still works;
# these columns are components, NOT the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the location dummy columns are never used below (X is taken from
# principal_df); the practical effect here is only removing 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Principal-component columns fed to the model (named after the original
# features when principal_df was built, but they are PCA components).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
# Target: the current-day mortality rate. Rows align with principal_df because
# both preserve df_updated's row order.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model (70/30,
# fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (the test set is transformed later).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaler (already fit on the training set) to the training set.
X_train_scaled = scaler.transform(X_train)
# Apply the same scaler to the test set (no refitting -- avoids leakage).
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid: 3*3*3*3*2*2 = 324 combinations; with cv=10 this trains
# 3,240 models, so n_jobs=-1 parallelizes across all cores.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10).
# NOTE(review): default scoring for a regressor is R^2, so best_score_ below is
# the mean cross-validated R^2 of the best parameter combination.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990940063827581
# Retrieve the best model found by the grid search. GridSearchCV refits
# best_estimator_ on the full training set automatically (refit=True is the
# default), so the extra best_model.fit(...) call here was redundant.
best_model = grid_search.best_estimator_
# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: MSE, RMSE, R^2, and an entropy-style statistic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the two
# inputs treated as (normalized) discrete distributions, not the entropy of the
# prediction errors. It returns inf whenever a prediction is <= 0 where the true
# value is positive (seen as "Entropy Value: inf" in a later run), so clip the
# predictions away from zero to keep the statistic finite.
entropy_val = entropy(y_test, np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.021788176538703085 R2 Score: 0.9979162072504859 RMSE: 0.147608 Entropy Value: 0.0017671236634175858
# Gain-based feature importances of the fitted booster (they sum to 1.0).
feature_importances = best_model.feature_importances_
# NOTE(review): X was built from principal components (principal_df), so these
# importances describe PCA components, not the original features whose names
# label them here.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.618477 |
| 2 | extreme_poverty | 0.277463 |
| 0 | hospital_beds_per_thousand | 0.066718 |
| 3 | population_density | 0.034665 |
| 4 | population | 0.002677 |
# Importing the dataframe of all 26 countries.
# NOTE(review): absolute, machine-specific path -- consider a relative path or
# a configurable data directory for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the population-health-index analysis.
country1 = 'United Kingdom'
country2 = 'Bulgaria'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows for the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13605 | United Kingdom | 12/25/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13606 | United Kingdom | 12/26/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13607 | United Kingdom | 12/27/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13608 | United Kingdom | 12/28/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13609 | United Kingdom | 12/29/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
2090 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features per country -- previous day (1), previous
# week (7), and previous month (30) -- so the time series becomes a supervised
# learning problem. Rows with no history (the first 1/7/30 days of each
# country) get 0 in place of the NaN produced by the shift.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the predictors.
# NOTE(review): the original code fit PCA on df_updated.iloc[:, 2:], which
# includes the 'Mortality Rate' TARGET column itself -- that leaks the target
# into the components and inflates every downstream model score. Fit on the
# predictor columns only (original features plus the lagged-mortality features).
# NOTE(review): PCA is still fit on all rows before the train/test split; for a
# fully leak-free evaluation it should be fit on the training split only.
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep 7 principal components -- the number of input variables used by the
# XGBoost model for the population health index.
n_components = 7
principal_components = pca.transform(pca_input)[:, :n_components]
# First 7 principal components as a dataframe. The column names are kept equal
# to the original feature names only so downstream column selection still works;
# these columns are components, NOT the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the location dummy columns are never used below (X is taken from
# principal_df); the practical effect here is only removing 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Principal-component columns fed to the model (named after the original
# features when principal_df was built, but they are PCA components).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target: the current-day mortality rate. Rows align with principal_df because
# both preserve df_updated's row order.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model (70/30,
# fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (the test set is transformed later).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaler (already fit on the training set) to the training set.
X_train_scaled = scaler.transform(X_train)
# Apply the same scaler to the test set (no refitting -- avoids leakage).
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid: 3*3*3*3*2*2 = 324 combinations; with cv=10 this trains
# 3,240 models, so n_jobs=-1 parallelizes across all cores.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10).
# NOTE(review): default scoring for a regressor is R^2, so best_score_ below is
# the mean cross-validated R^2 of the best parameter combination.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9362768989166241
# Retrieve the best model found by the grid search. GridSearchCV refits
# best_estimator_ on the full training set automatically (refit=True is the
# default), so the extra best_model.fit(...) call here was redundant.
best_model = grid_search.best_estimator_
# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: MSE, RMSE, R^2, and an entropy-style statistic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the two
# inputs treated as (normalized) discrete distributions, not the entropy of the
# prediction errors. It returns inf whenever a prediction is <= 0 where the true
# value is positive (exactly what this cell printed: "Entropy Value: inf"), so
# clip the predictions away from zero to keep the statistic finite.
entropy_val = entropy(y_test, np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.9616768347996019 R2 Score: 0.9502585480128497 RMSE: 0.980651 Entropy Value: inf
# Gain-based feature importances of the fitted booster (they sum to 1.0).
feature_importances = best_model.feature_importances_
# NOTE(review): X was built from principal components (principal_df), so these
# importances describe PCA components, not the original features whose names
# label them here.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.468417 |
| 5 | aged_65_older | 0.317984 |
| 4 | life_expectancy | 0.071588 |
| 6 | median_age | 0.051958 |
| 2 | female_smokers | 0.041580 |
| 0 | cardiovasc_death_rate | 0.037815 |
| 3 | male_smokers | 0.010659 |
# Importing the dataframe of all 26 countries.
# NOTE(review): absolute, machine-specific path -- consider a relative path or
# a configurable data directory for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the country-health-index analysis.
country1 = 'United Kingdom'
country2 = 'Bulgaria'
# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows for the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.5 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.5 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.5 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.5 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.5 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13605 | United Kingdom | 12/25/2022 | 2.540 | 0.932 | 0.2 | 272.898 | 67508936 | 0.883564 |
| 13606 | United Kingdom | 12/26/2022 | 2.540 | 0.932 | 0.2 | 272.898 | 67508936 | 0.883564 |
| 13607 | United Kingdom | 12/27/2022 | 2.540 | 0.932 | 0.2 | 272.898 | 67508936 | 0.883564 |
| 13608 | United Kingdom | 12/28/2022 | 2.540 | 0.932 | 0.2 | 272.898 | 67508936 | 0.883564 |
| 13609 | United Kingdom | 12/29/2022 | 2.540 | 0.932 | 0.2 | 272.898 | 67508936 | 0.883564 |
2090 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features per country -- previous day (1), previous
# week (7), and previous month (30) -- so the time series becomes a supervised
# learning problem. Rows with no history (the first 1/7/30 days of each
# country) get 0 in place of the NaN produced by the shift.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the predictors.
# NOTE(review): the original code fit PCA on df_updated.iloc[:, 2:], which
# includes the 'Mortality Rate' TARGET column itself -- that leaks the target
# into the components and inflates every downstream model score. Fit on the
# predictor columns only (original features plus the lagged-mortality features).
# NOTE(review): PCA is still fit on all rows before the train/test split; for a
# fully leak-free evaluation it should be fit on the training split only.
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep 5 principal components -- the number of input variables used by the
# XGBoost model for the country health index.
n_components = 5
principal_components = pca.transform(pca_input)[:, :n_components]
# First 5 principal components as a dataframe. The column names are kept equal
# to the original feature names only so downstream column selection still works;
# these columns are components, NOT the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the location dummy columns are never used below (X is taken from
# principal_df); the practical effect here is only removing 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Principal-component columns fed to the model (named after the original
# features when principal_df was built, but they are PCA components).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
# Target: the current-day mortality rate. Rows align with principal_df because
# both preserve df_updated's row order.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model (70/30,
# fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (the test set is transformed later).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaler (already fit on the training set) to the training set.
X_train_scaled = scaler.transform(X_train)
# Apply the same scaler to the test set (no refitting -- avoids leakage).
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Hyperparameter grid: 3*3*3*3*2*2 = 324 combinations; with cv=10 this trains
# 3,240 models, so n_jobs=-1 parallelizes across all cores.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10).
# NOTE(review): default scoring for a regressor is R^2, so best_score_ below is
# the mean cross-validated R^2 of the best parameter combination.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9336897557962663
# Retrieve the best model found by the grid search. GridSearchCV refits
# best_estimator_ on the full training set automatically (refit=True is the
# default), so the extra best_model.fit(...) call here was redundant.
best_model = grid_search.best_estimator_
# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: MSE, RMSE, R^2, and an entropy-style statistic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the two
# inputs treated as (normalized) discrete distributions, not the entropy of the
# prediction errors. It returns inf whenever a prediction is <= 0 where the true
# value is positive (seen as "Entropy Value: inf" in an earlier run), so clip
# the predictions away from zero to keep the statistic finite.
entropy_val = entropy(y_test, np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.46495768148618166 R2 Score: 0.9759506838962997 RMSE: 0.681878 Entropy Value: 0.006551530873100116
# Gain-based feature importances of the fitted booster (they sum to 1.0).
feature_importances = best_model.feature_importances_
# NOTE(review): X was built from principal components (principal_df), so these
# importances describe PCA components, not the original features whose names
# label them here.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.608164 |
| 4 | population | 0.227605 |
| 0 | hospital_beds_per_thousand | 0.068975 |
| 2 | extreme_poverty | 0.063380 |
| 3 | population_density | 0.031876 |
# Importing the dataframe of all 26 countries.
# NOTE(review): absolute, machine-specific path -- consider a relative path or
# a configurable data directory for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the population-health-index analysis.
country1 = 'Cyprus'
country2 = 'Czechia'
# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows for the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919575 |
2061 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (grouping by 'location' keeps each country's series separate, so a lag never crosses a country
# boundary; the first 1/7/30 rows of each country have no prior value and come out as NaN)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (0 acts as a "no prior observation" sentinel — presumably acceptable because early-pandemic
# mortality is 0 in this data; verify that assumption holds for every country included)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] covers every column after 'location'/'date', which here includes
# 'Mortality Rate' (the prediction target) and its three lag columns — so the target leaks into
# the fitted components. PCA is also fit on the full dataset before the train/test split, and on
# unscaled features (variance is dominated by large-magnitude columns). Confirm all three are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): this keeps the first 7 of the available components; 7 matches the count of
# original health features, but each retained component is a mixture of ALL input columns.
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original features — labeling
# them with the original feature names is misleading; downstream feature importances actually
# rank components, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model input below (X is built from
# principal_df); only 'Mortality Rate' is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# (fit on the training split only — correct; note scaling is applied AFTER PCA here, whereas
# the conventional order is to standardize before fitting PCA)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
# Hyperparameter grid to search for the XGBoost regressor.
search_space = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelized across all cores.
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(), param_grid=search_space, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9968265232047949
# Fit the model using the best hyperparameters
# (GridSearchCV refits best_estimator_ on the full training data by default, so this
# explicit re-fit is redundant but harmless; kept for clarity.)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes both inputs to probability distributions and computes
# their KL divergence; it returns inf whenever a prediction is <= 0 at a point where the true
# value is positive. Clip predictions to a tiny positive floor so the diagnostic stays finite
# (predictions that were already positive are unaffected).
entropy_val = entropy(y_test, np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.000978608688705832 R2 Score: 0.9982973886897886 RMSE: 0.031283 Entropy Value: 0.0005033828926016659
# Rank the model's input features (principal components) from most to least important.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.658933 |
| 0 | cardiovasc_death_rate | 0.184826 |
| 5 | aged_65_older | 0.094490 |
| 6 | median_age | 0.029701 |
| 2 | female_smokers | 0.024293 |
| 3 | male_smokers | 0.006019 |
| 4 | life_expectancy | 0.001737 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — consider a relative path or a config value
# so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis.
country1 = 'Cyprus'
country2 = 'Czechia'
# Keep only the identifier columns plus the country-health-index features for the XGBoost analysis,
# restricted to the two countries being compared.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.40 | 0.887 | 0.15 | 127.657 | 896007 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 3.40 | 0.887 | 0.15 | 127.657 | 896007 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 3.40 | 0.887 | 0.15 | 127.657 | 896007 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 3.40 | 0.887 | 0.15 | 127.657 | 896007 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 3.40 | 0.887 | 0.15 | 127.657 | 896007 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 6.63 | 0.900 | 0.00 | 137.176 | 10493990 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 6.63 | 0.900 | 0.00 | 137.176 | 10493990 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 6.63 | 0.900 | 0.00 | 137.176 | 10493990 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 6.63 | 0.900 | 0.00 | 137.176 | 10493990 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 6.63 | 0.900 | 0.00 | 137.176 | 10493990 | 0.919575 |
2061 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (grouping by 'location' keeps each country's series separate, so a lag never crosses a country
# boundary; the first 1/7/30 rows of each country have no prior value and come out as NaN)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (0 acts as a "no prior observation" sentinel — presumably acceptable because early-pandemic
# mortality is 0 in this data; verify that assumption holds for every country included)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] covers every column after 'location'/'date', which here includes
# 'Mortality Rate' (the prediction target) and its three lag columns — so the target leaks into
# the fitted components. PCA is also fit on the full dataset before the train/test split, and on
# unscaled features (variance is dominated by large-magnitude columns). Confirm all three are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): this keeps the first 5 of the available components; 5 matches the count of
# original country-health features, but each retained component is a mixture of ALL input columns.
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original features — labeling
# them with the original feature names is misleading; downstream feature importances actually
# rank components, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model input below (X is built from
# principal_df); only 'Mortality Rate' is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# (fit on the training split only — correct; note scaling is applied AFTER PCA here, whereas
# the conventional order is to standardize before fitting PCA)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
# Hyperparameter grid to search for the XGBoost regressor.
search_space = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelized across all cores.
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(), param_grid=search_space, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9943317261946557
# Fit the model using the best hyperparameters
# (GridSearchCV refits best_estimator_ on the full training data by default, so this
# explicit re-fit is redundant but harmless; kept for clarity.)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes both inputs to probability distributions and computes
# their KL divergence; it returns inf whenever a prediction is <= 0 at a point where the true
# value is positive. Clip predictions to a tiny positive floor so the diagnostic stays finite
# (predictions that were already positive are unaffected).
entropy_val = entropy(y_test, np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0027632388218584106 R2 Score: 0.9951924382797649 RMSE: 0.052567 Entropy Value: 0.001055748763209947
# Rank the model's input features (principal components) from most to least important.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.462078 |
| 0 | hospital_beds_per_thousand | 0.393009 |
| 2 | extreme_poverty | 0.063780 |
| 4 | population | 0.049975 |
| 3 | population_density | 0.031158 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — consider a relative path or a config value
# so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis.
country1 = 'Estonia'
country2 = 'Latvia'
# Keep only the identifier columns plus the population-health features for the XGBoost analysis,
# restricted to the two countries being compared.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6250 | Estonia | 1/18/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6251 | Estonia | 2/5/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6252 | Estonia | 2/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6253 | Estonia | 2/7/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631969 |
2099 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (grouping by 'location' keeps each country's series separate, so a lag never crosses a country
# boundary; the first 1/7/30 rows of each country have no prior value and come out as NaN)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (0 acts as a "no prior observation" sentinel — presumably acceptable because early-pandemic
# mortality is 0 in this data; verify that assumption holds for every country included)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] covers every column after 'location'/'date', which here includes
# 'Mortality Rate' (the prediction target) and its three lag columns — so the target leaks into
# the fitted components. PCA is also fit on the full dataset before the train/test split, and on
# unscaled features (variance is dominated by large-magnitude columns). Confirm all three are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): this keeps the first 7 of the available components; 7 matches the count of
# original health features, but each retained component is a mixture of ALL input columns.
n_components = 7 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original features — labeling
# them with the original feature names is misleading; downstream feature importances actually
# rank components, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model input below (X is built from
# principal_df); only 'Mortality Rate' is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# (fit on the training split only — correct; note scaling is applied AFTER PCA here, whereas
# the conventional order is to standardize before fitting PCA)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
# Hyperparameter grid to search for the XGBoost regressor.
search_space = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelized across all cores.
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(), param_grid=search_space, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9983509839471967
# Fit the model using the best hyperparameters
# (GridSearchCV refits best_estimator_ on the full training data by default, so this
# explicit re-fit is redundant but harmless; kept for clarity.)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes both inputs to probability distributions and computes
# their KL divergence; it returns inf whenever a prediction is <= 0 at a point where the true
# value is positive (the cause of the "Entropy Value: inf" seen in earlier runs). Clip
# predictions to a tiny positive floor so the diagnostic stays finite.
entropy_val = entropy(y_test, np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0011442225986659504 R2 Score: 0.998102656857225 RMSE: 0.033826 Entropy Value: inf
# Rank the model's input features (principal components) from most to least important.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.764753 |
| 0 | cardiovasc_death_rate | 0.126479 |
| 5 | aged_65_older | 0.055045 |
| 6 | median_age | 0.033800 |
| 2 | female_smokers | 0.018890 |
| 3 | male_smokers | 0.000750 |
| 4 | life_expectancy | 0.000282 |
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — consider a relative path or a config value
# so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis.
country1 = 'Estonia'
country2 = 'Latvia'
# Keep only the identifier columns plus the country-health-index features for the XGBoost analysis,
# restricted to the two countries being compared.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 4.69 | 0.892 | 0.5 | 31.033 | 1326064 | 0.000000 |
| 6250 | Estonia | 1/18/2020 | 4.69 | 0.892 | 0.5 | 31.033 | 1326064 | 0.000000 |
| 6251 | Estonia | 2/5/2020 | 4.69 | 0.892 | 0.5 | 31.033 | 1326064 | 0.000000 |
| 6252 | Estonia | 2/6/2020 | 4.69 | 0.892 | 0.5 | 31.033 | 1326064 | 0.000000 |
| 6253 | Estonia | 2/7/2020 | 4.69 | 0.892 | 0.5 | 31.033 | 1326064 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.57 | 0.866 | 0.7 | 31.212 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.57 | 0.866 | 0.7 | 31.212 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.57 | 0.866 | 0.7 | 31.212 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.57 | 0.866 | 0.7 | 31.212 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.57 | 0.866 | 0.7 | 31.212 | 1850654 | 0.631969 |
2099 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (grouping by 'location' keeps each country's series separate, so a lag never crosses a country
# boundary; the first 1/7/30 rows of each country have no prior value and come out as NaN)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (0 acts as a "no prior observation" sentinel — presumably acceptable because early-pandemic
# mortality is 0 in this data; verify that assumption holds for every country included)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] covers every column after 'location'/'date', which here includes
# 'Mortality Rate' (the prediction target) and its three lag columns — so the target leaks into
# the fitted components. PCA is also fit on the full dataset before the train/test split, and on
# unscaled features (variance is dominated by large-magnitude columns). Confirm all three are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): this keeps the first 5 of the available components; 5 matches the count of
# original country-health features, but each retained component is a mixture of ALL input columns.
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original features — labeling
# them with the original feature names is misleading; downstream feature importances actually
# rank components, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model input below (X is built from
# principal_df); only 'Mortality Rate' is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# (fit on the training split only — correct; note scaling is applied AFTER PCA here, whereas
# the conventional order is to standardize before fitting PCA)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
# Hyperparameter grid to search for the XGBoost regressor.
search_space = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, parallelized across all cores.
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(), param_grid=search_space, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9974510837613941
# Fit the model using the best hyperparameters
# (GridSearchCV refits best_estimator_ on the full training data by default, so this
# explicit re-fit is redundant but harmless; kept for clarity.)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes both inputs to probability distributions and computes
# their KL divergence; it returns inf whenever a prediction is <= 0 at a point where the true
# value is positive (the cause of the "Entropy Value: inf" seen in earlier runs). Clip
# predictions to a tiny positive floor so the diagnostic stays finite.
entropy_val = entropy(y_test, np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0018480402171181661 R2 Score: 0.9969355906467765 RMSE: 0.042989 Entropy Value: inf
# Rank the model's input features (principal components) from most to least important.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.772094 |
| 3 | population_density | 0.102371 |
| 0 | hospital_beds_per_thousand | 0.068553 |
| 2 | extreme_poverty | 0.051827 |
| 4 | population | 0.005155 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute, user-specific path — consider a relative path or a
# configurable constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Portugal'
country2 = 'Romania'
# Keep the identifiers ('location', 'date'), the seven population-health
# predictors, and the target ('Mortality Rate'), then restrict the rows to
# the two countries being compared.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate',
                          'diabetes_prevalence', 'female_smokers',
                          'male_smokers', 'life_expectancy', 'aged_65_older',
                          'median_age', 'Mortality Rate']
df_updated = df_updated[population_health_cols]
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated[country_mask]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
2072 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features: shift each country's Mortality Rate series by
# 1, 7, and 30 days (grouping by 'location' keeps the shift from pulling
# values across a country boundary), then fill the leading gaps with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the
# seven population-health predictors.
#
# NOTE(review): the original code fit PCA on df_updated.iloc[:, 2:], which at
# this point also contains 'Mortality Rate' and its three lag columns — i.e.
# the prediction target leaked into the model inputs (hence the near-perfect
# R^2 downstream).  PCA is now fit on the predictor columns only.  It is
# still fit on all rows before the train/test split — a milder leak, noted
# for a future revision.
feature_cols = ['cardiovasc_death_rate', 'diabetes_prevalence',
                'female_smokers', 'male_smokers', 'life_expectancy',
                'aged_65_older', 'median_age']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep one component per input variable (7).  Components are ordered by
# explained variance and are mixtures of all original features, so they are
# labelled PC1..PC7 rather than reusing the raw feature names.
n_components = 7
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
selected_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (kept for parity with the original flow;
# the dummy columns are not part of X below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise the components, fitting on the training set only to avoid
# test-set leakage.  (Scaling *before* PCA would be preferable, since PCA is
# driven by raw variances — noted for a future revision.)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 324 parameter combinations; with cv=10 this trains 3240 models.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 parallelises across cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9988248313239702
# Fit the model using the best hyperparameters
# NOTE(review): best_estimator_ is already refit on the full training set
# (GridSearchCV refit=True by default), so this explicit .fit() is redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is a KL divergence between
# probability distributions, not a regression metric; it returns inf whenever
# pk has mass where qk is zero — confirm this metric is intended here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0022348534753311943 R2 Score: 0.9986910224301357 RMSE: 0.047274 Entropy Value: 0.0003035979830766401
# Rank the model inputs by XGBoost's learned importance scores, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.525057 |
| 0 | cardiovasc_death_rate | 0.282578 |
| 5 | aged_65_older | 0.132401 |
| 6 | median_age | 0.035947 |
| 2 | female_smokers | 0.019500 |
| 3 | male_smokers | 0.003542 |
| 4 | life_expectancy | 0.000975 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute, user-specific path — consider a relative path or a
# configurable constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Portugal'
country2 = 'Romania'
# Restrict to the two countries under comparison and to the country-level
# (infrastructure / economic) predictors plus identifiers and the target.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'extreme_poverty',
                       'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[country_health_cols]
selected_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated[selected_countries]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.390 | 0.864 | 0.5 | 112.371 | 10270857 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.390 | 0.864 | 0.5 | 112.371 | 10270857 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.390 | 0.864 | 0.5 | 112.371 | 10270857 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.390 | 0.864 | 0.5 | 112.371 | 10270857 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.390 | 0.864 | 0.5 | 112.371 | 10270857 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 6.892 | 0.828 | 5.7 | 85.129 | 19659270 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 6.892 | 0.828 | 5.7 | 85.129 | 19659270 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 6.892 | 0.828 | 5.7 | 85.129 | 19659270 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 6.892 | 0.828 | 5.7 | 85.129 | 19659270 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 6.892 | 0.828 | 5.7 | 85.129 | 19659270 | 2.036403 |
2072 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so a shift never
# pulls values across a country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the first 1/7/30 rows per country have no history; imputing 0
# matches the early-pandemic mortality of 0 visible in this data, but would
# be wrong for a series that does not start at zero.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the
# five country-level predictors.
#
# NOTE(review): the original code fit PCA on df_updated.iloc[:, 2:], which at
# this point also contains 'Mortality Rate' and its three lag columns — the
# prediction target leaked into the model inputs.  PCA is now fit on the
# predictor columns only.  It is still fit on all rows before the train/test
# split — a milder leak, noted for a future revision.
feature_cols = ['hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'population_density', 'population']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep one component per input variable (5).  Components are ordered by
# explained variance and are mixtures of all original features, so they are
# labelled PC1..PC5 rather than reusing the raw feature names.
n_components = 5
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
selected_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (kept for parity with the original flow;
# the dummy columns are not part of X below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise the components, fitting on the training set only to avoid
# test-set leakage.  (Scaling *before* PCA would be preferable, since PCA is
# driven by raw variances — noted for a future revision.)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 324 parameter combinations; with cv=10 this trains 3240 models.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 parallelises across cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9981547439951199
# Fit the model using the best hyperparameters
# NOTE(review): best_estimator_ is already refit on the full training set
# (GridSearchCV refit=True by default), so this explicit .fit() is redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is a KL divergence between
# probability distributions, not a regression metric; it returns inf whenever
# pk has mass where qk is zero — confirm this metric is intended here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.003836200339738359 R2 Score: 0.9977530964541294 RMSE: 0.061937 Entropy Value: 0.000381309516154465
# Rank the model inputs by XGBoost's learned importance scores, largest first.
feature_importances = best_model.feature_importances_
# NOTE(review): these labels come from selected_cols; if X was built from
# principal components upstream, the names describe PCs, not raw features.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.584363 |
| 0 | hospital_beds_per_thousand | 0.321504 |
| 2 | extreme_poverty | 0.054565 |
| 3 | population_density | 0.035218 |
| 4 | population | 0.004351 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute, user-specific path — consider a relative path or a
# configurable constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Serbia'
country2 = 'Slovakia'
# Extracting important features for XGBoost Model Analysis for the population health index
# Keep the identifiers ('location', 'date'), the seven demographic/health
# predictors, and the target ('Mortality Rate'); then keep only the rows for
# the two selected countries.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716205 |
2067 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features: shift each country's Mortality Rate series by
# 1, 7, and 30 days (grouping by 'location' keeps the shift from pulling
# values across a country boundary), then fill the leading gaps with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the
# seven population-health predictors.
#
# NOTE(review): the original code fit PCA on df_updated.iloc[:, 2:], which at
# this point also contains 'Mortality Rate' and its three lag columns — the
# prediction target leaked into the model inputs.  PCA is now fit on the
# predictor columns only.  It is still fit on all rows before the train/test
# split — a milder leak, noted for a future revision.
feature_cols = ['cardiovasc_death_rate', 'diabetes_prevalence',
                'female_smokers', 'male_smokers', 'life_expectancy',
                'aged_65_older', 'median_age']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep one component per input variable (7).  Components are ordered by
# explained variance and are mixtures of all original features, so they are
# labelled PC1..PC7 rather than reusing the raw feature names.
n_components = 7
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
selected_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (kept for parity with the original flow;
# the dummy columns are not part of X below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise the components, fitting on the training set only to avoid
# test-set leakage.  (Scaling *before* PCA would be preferable, since PCA is
# driven by raw variances — noted for a future revision.)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Define XGBoost model
xgb_model = xgb.XGBRegressor()
# Define hyperparameters to tune
# 324 parameter combinations; with cv=10 this trains 3240 models.
params = {'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'gamma': [0, 0.1, 0.2],
'subsample': [0.8, 0.9],
'colsample_bytree': [0.8, 0.9]}
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 parallelises across cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9962453859478295
# Fit the model using the best hyperparameters
# NOTE(review): best_estimator_ is already refit on the full training set
# (GridSearchCV refit=True by default), so this explicit .fit() is redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
# Making Predictions
y_pred = best_model.predict(X_test_scaled)
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is a KL divergence between
# probability distributions, not a regression metric; it returns inf whenever
# pk has mass where qk is zero (see "Entropy Value: inf" below) — confirm
# this metric is intended here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.000787005226820817 R2 Score: 0.9967791971622308 RMSE: 0.028054 Entropy Value: inf
# Rank the model inputs by XGBoost's learned importance scores, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.825131 |
| 6 | median_age | 0.119560 |
| 5 | aged_65_older | 0.036567 |
| 2 | female_smokers | 0.013630 |
| 3 | male_smokers | 0.001937 |
| 4 | life_expectancy | 0.001725 |
| 0 | cardiovasc_death_rate | 0.001451 |
# Importing the dataframe of all 26 countries
# NOTE(review): absolute, user-specific path — consider a relative path or a
# configurable constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Serbia'
country2 = 'Slovakia'
# Restrict to the two countries under comparison and to the country-level
# (infrastructure / economic) predictors plus identifiers and the target.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'extreme_poverty',
                       'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[country_health_cols]
selected_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated[selected_countries]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.820 | 0.860 | 0.70 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.820 | 0.860 | 0.70 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.820 | 0.860 | 0.70 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.820 | 0.860 | 0.70 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.820 | 0.860 | 0.70 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 5.609 | 0.806 | 0.05 | 80.291 | 6871547 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 5.609 | 0.806 | 0.05 | 80.291 | 6871547 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 5.609 | 0.806 | 0.05 | 80.291 | 6871547 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 5.609 | 0.806 | 0.05 | 80.291 | 6871547 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 5.609 | 0.806 | 0.05 | 80.291 | 6871547 | 0.716205 |
2067 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
Converting the timeseries dataset into a supervised learning problem in this way allows the XGBoost Model to be used directly to assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality-rate features (previous day, previous week, and
# previous month) so the timeseries becomes a supervised-learning table that
# XGBoost can consume. The shift is computed per country, so one location's
# history never bleeds into another's.
for lag_column, lag in [('prev_day_mortality', 1),
                        ('prev_week_mortality', 7),
                        ('prev_month_mortality', 30)]:
    df_updated[lag_column] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
    # The first `lag` rows of each country have no history; fill with 0 rather
    # than dropping them. NOTE(review): 0 conflates "no data yet" with "zero
    # mortality" — confirm this is the intended imputation.
    df_updated[lag_column] = df_updated[lag_column].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] selects every column after 'location' and 'date',
# which includes 'Mortality Rate' (the prediction target) and its lagged copies —
# the components are therefore partly built from the target itself (target
# leakage). Confirm whether the target/lag columns should be excluded here.
# NOTE(review): PCA is also fitted on the full dataset before the train/test
# split below, leaking test-set statistics into the transform.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index.
n_components = 5  # number of input variables kept for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Wrap the component scores in a dataframe.
# NOTE(review): these columns are principal components (linear mixes of ALL the
# columns fed to PCA above, including the target/lag columns), not the original
# variables — labelling them with the original feature names is misleading, and
# the feature-importance table built later inherits these labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the dummy location columns created here are never used as model
# inputs below — only 'Mortality Rate' is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
# X: the first n_components PCA scores; y: the mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics (mean/std) from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the statistics learned from the training set.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Hyperparameter grid to tune for the XGBoost regressor.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(), param_grid=param_grid,
                           cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9935786810447313
# Retrieve the best estimator from the grid search. With the default
# refit=True, GridSearchCV has already retrained it on the full training set,
# so an extra fit() call here is redundant and has been removed.
best_model = grid_search.best_estimator_
# Making predictions on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as probability
# distributions (normalising each and computing KL divergence), not as paired
# regression values — it can return inf/nan if any prediction is negative.
# TODO confirm this metric is intended for regression residual comparison.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0014413661864962608 R2 Score: 0.9941012382821331 RMSE: 0.037965 Entropy Value: 0.0008811405071239758
# Tabulate the model's feature importances, most influential first.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.571363 |
| 0 | hospital_beds_per_thousand | 0.291424 |
| 2 | extreme_poverty | 0.051676 |
| 4 | population | 0.046745 |
| 3 | population_density | 0.038791 |
# Importing the dataframe of all 26 countries.
# NOTE(review): hard-coded absolute Windows path — this only runs on this
# machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Slovenia'
country2 = 'Spain'
# Restrict the frame to the two countries of interest and keep only the
# predictors used by the XGBoost population-health-index analysis plus the target.
feature_columns = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                   'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
                   'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_columns]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 24074 | Spain | 2/1/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24075 | Spain | 2/2/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24076 | Spain | 2/3/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24077 | Spain | 2/4/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24078 | Spain | 2/5/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.536669 |
2125 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
Converting the timeseries dataset into a supervised learning problem in this way allows the XGBoost Model to be used directly to assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality-rate features (previous day, previous week, and
# previous month) so the timeseries becomes a supervised-learning table that
# XGBoost can consume. The shift is computed per country, so one location's
# history never bleeds into another's.
for lag_column, lag in [('prev_day_mortality', 1),
                        ('prev_week_mortality', 7),
                        ('prev_month_mortality', 30)]:
    df_updated[lag_column] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
    # The first `lag` rows of each country have no history; fill with 0 rather
    # than dropping them. NOTE(review): 0 conflates "no data yet" with "zero
    # mortality" — confirm this is the intended imputation.
    df_updated[lag_column] = df_updated[lag_column].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] selects every column after 'location' and 'date',
# which includes 'Mortality Rate' (the prediction target) and its lagged copies —
# the components are therefore partly built from the target itself (target
# leakage). Confirm whether the target/lag columns should be excluded here.
# NOTE(review): PCA is also fitted on the full dataset before the train/test
# split below, leaking test-set statistics into the transform.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index.
n_components = 7  # number of input variables kept for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Wrap the component scores in a dataframe.
# NOTE(review): these columns are principal components (linear mixes of ALL the
# columns fed to PCA above, including the target/lag columns), not the original
# variables — labelling them with the original feature names is misleading, and
# the feature-importance table built later inherits these labels.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the dummy location columns created here are never used as model
# inputs below — only 'Mortality Rate' is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: the first n_components PCA scores; y: the mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics (mean/std) from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the statistics learned from the training set.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Hyperparameter grid to tune for the XGBoost regressor.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(), param_grid=param_grid,
                           cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989690618121256
# Retrieve the best estimator from the grid search. With the default
# refit=True, GridSearchCV has already retrained it on the full training set,
# so an extra fit() call here is redundant and has been removed.
best_model = grid_search.best_estimator_
# Making predictions on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as probability
# distributions (normalising each and computing KL divergence), not as paired
# regression values — it can return inf/nan if any prediction is negative.
# TODO confirm this metric is intended for regression residual comparison.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0060921214023629666 R2 Score: 0.9990656666044356 RMSE: 0.078052 Entropy Value: 0.000580086181770288
# Tabulate the model's feature importances, most influential first.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.754377 |
| 0 | cardiovasc_death_rate | 0.120936 |
| 6 | median_age | 0.072379 |
| 2 | female_smokers | 0.030701 |
| 5 | aged_65_older | 0.018044 |
| 3 | male_smokers | 0.003379 |
| 4 | life_expectancy | 0.000184 |
# Importing the dataframe of all 26 countries.
# NOTE(review): hard-coded absolute Windows path — this only runs on this
# machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Slovenia'
country2 = 'Spain'
# Restrict the frame to the two countries of interest and keep only the
# predictors used by the XGBoost country-health-index analysis plus the target.
feature_columns = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                   'extreme_poverty', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_columns]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 24074 | Spain | 2/1/2020 | 2.97 | 0.904 | 1.0 | 93.105 | 47558632 | 0.000000 |
| 24075 | Spain | 2/2/2020 | 2.97 | 0.904 | 1.0 | 93.105 | 47558632 | 0.000000 |
| 24076 | Spain | 2/3/2020 | 2.97 | 0.904 | 1.0 | 93.105 | 47558632 | 0.000000 |
| 24077 | Spain | 2/4/2020 | 2.97 | 0.904 | 1.0 | 93.105 | 47558632 | 0.000000 |
| 24078 | Spain | 2/5/2020 | 2.97 | 0.904 | 1.0 | 93.105 | 47558632 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 102.619 | 2119843 | 0.536669 |
2125 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
Converting the timeseries dataset into a supervised learning problem in this way allows the XGBoost Model to be used directly to assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality-rate features (previous day, previous week, and
# previous month) so the timeseries becomes a supervised-learning table that
# XGBoost can consume. The shift is computed per country, so one location's
# history never bleeds into another's.
for lag_column, lag in [('prev_day_mortality', 1),
                        ('prev_week_mortality', 7),
                        ('prev_month_mortality', 30)]:
    df_updated[lag_column] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
    # The first `lag` rows of each country have no history; fill with 0 rather
    # than dropping them. NOTE(review): 0 conflates "no data yet" with "zero
    # mortality" — confirm this is the intended imputation.
    df_updated[lag_column] = df_updated[lag_column].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] selects every column after 'location' and 'date',
# which includes 'Mortality Rate' (the prediction target) and its lagged copies —
# the components are therefore partly built from the target itself (target
# leakage). Confirm whether the target/lag columns should be excluded here.
# NOTE(review): PCA is also fitted on the full dataset before the train/test
# split below, leaking test-set statistics into the transform.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index.
n_components = 5  # number of input variables kept for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Wrap the component scores in a dataframe.
# NOTE(review): these columns are principal components (linear mixes of ALL the
# columns fed to PCA above, including the target/lag columns), not the original
# variables — labelling them with the original feature names is misleading, and
# the feature-importance table built later inherits these labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the dummy location columns created here are never used as model
# inputs below — only 'Mortality Rate' is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
# X: the first n_components PCA scores; y: the mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics (mean/std) from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the statistics learned from the training set.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Hyperparameter grid to tune for the XGBoost regressor.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
# Exhaustive grid search with 10-fold cross-validation, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(), param_grid=param_grid,
                           cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9986016344611052
# Retrieve the best estimator from the grid search. With the default
# refit=True, GridSearchCV has already retrained it on the full training set,
# so an extra fit() call here is redundant and has been removed.
best_model = grid_search.best_estimator_
# Making predictions on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as probability
# distributions (normalising each and computing KL divergence), not as paired
# regression values — it can return inf/nan if any prediction is negative.
# TODO confirm this metric is intended for regression residual comparison.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.01192771638114138 R2 Score: 0.9981706760237249 RMSE: 0.109214 Entropy Value: 0.0014002886806236585
# Tabulate the model's feature importances, most influential first.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.726845 |
| 2 | extreme_poverty | 0.142845 |
| 0 | hospital_beds_per_thousand | 0.091830 |
| 3 | population_density | 0.030780 |
| 4 | population | 0.007701 |